*** empty log message ***
This commit is contained in:
parent
eb0aaf9adf
commit
074c943862
68
cuda_la.cc
68
cuda_la.cc
@ -1,2 +1,70 @@
|
||||
#include "la_traits.h"
|
||||
#include "cuda_la.h"
|
||||
|
||||
#ifdef CUDALA
|
||||
|
||||
namespace LA {
|
||||
|
||||
GPUID DEFAULT_LOC = cpu;
|
||||
|
||||
void set_default_loc(const GPUID loc)
|
||||
{
|
||||
DEFAULT_LOC = loc;
|
||||
}
|
||||
|
||||
void *gpualloc(size_t size)
|
||||
{
|
||||
cublasStatus status;
|
||||
void *ptr=NULL;
|
||||
status = cublasAlloc(size,1,&ptr);
|
||||
if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
|
||||
return ptr;
|
||||
}
|
||||
|
||||
|
||||
void gpufree(void *ptr)
|
||||
{
|
||||
cublasStatus status = cublasFree(ptr);
|
||||
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasFree");
|
||||
}
|
||||
|
||||
void gpuget(size_t n, size_t elsize, const void *from, void *to)
|
||||
{
|
||||
cublasStatus status;
|
||||
status=cublasGetVector(n,elsize,from,1,to,1);
|
||||
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasGetVector");
|
||||
}
|
||||
|
||||
void gpuput(size_t n, size_t elsize, const void *from, void *to)
|
||||
{
|
||||
cublasStatus status;
|
||||
status=cublasSetVector(n,elsize,from,1,to,1);
|
||||
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
|
||||
}
|
||||
|
||||
double *gpuputdouble(const double &x)
|
||||
{
|
||||
cublasStatus status;
|
||||
void *ptr=NULL;
|
||||
status = cublasAlloc(1,sizeof(double),&ptr);
|
||||
if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
|
||||
status=cublasSetVector(1,sizeof(double),&x,1,ptr,1);
|
||||
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
|
||||
return (double *)ptr;
|
||||
}
|
||||
|
||||
complex<double> *gpuputcomplex(const complex<double> &x)
|
||||
{
|
||||
cublasStatus status;
|
||||
void *ptr=NULL;
|
||||
status = cublasAlloc(1,sizeof(complex<double>),&ptr);
|
||||
if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
|
||||
status=cublasSetVector(1,sizeof(complex<double>),&x,1,ptr,1);
|
||||
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
|
||||
return (complex<double> *)ptr;
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
#endif
|
||||
|
56
cuda_la.h
Normal file
56
cuda_la.h
Normal file
@ -0,0 +1,56 @@
|
||||
#ifndef _CUDA_LA_H
|
||||
#define _CUDA_LA_H
|
||||
|
||||
#ifdef CUDALA
|
||||
#undef MATPTR
|
||||
#include "cublas.h"
|
||||
#endif
|
||||
|
||||
#include "la_traits.h"
|
||||
|
||||
namespace LA {
|
||||
|
||||
#ifdef CUDALA
|
||||
#define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");}
|
||||
#define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
|
||||
#define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
|
||||
#else
|
||||
#define NOT_GPU(x) {}
|
||||
#define SAME_LOC(x,y) {}
|
||||
#define SAME_LOC3(x,y,z) {}
|
||||
#endif
|
||||
|
||||
typedef enum {undefined=-1, cpu=0, gpu1=1, gpu2=2, gpu3=3, gpu4=4} GPUID;
|
||||
|
||||
#ifdef CUDALA
|
||||
|
||||
//global static instantiation of this class will provide automatic init/shutdown of GPU
|
||||
class GPU_START {
|
||||
public:
|
||||
GPU_START(void)
|
||||
{
|
||||
cublasStatus status = cublasInit();
|
||||
if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot init GPU for CUBLAS");
|
||||
}
|
||||
~GPU_START(void)
|
||||
{
|
||||
cublasStatus status = cublasShutdown();
|
||||
if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot cleanly shutdown GPU");
|
||||
}
|
||||
};
|
||||
|
||||
extern void *gpualloc(size_t size);
|
||||
extern void gpufree(void *ptr);
|
||||
extern void gpuget(size_t n, size_t elsize, const void *from, void *to);
|
||||
extern void gpuput(size_t n, size_t elsize, const void *from, void *to);
|
||||
extern double *gpuputdouble(const double &x);
|
||||
extern complex<double> *gpuputcomplex(const complex<double> &x);
|
||||
|
||||
void set_default_loc(const GPUID loc);
|
||||
|
||||
extern GPUID DEFAULT_LOC;
|
||||
|
||||
|
||||
#endif
|
||||
}
|
||||
#endif
|
10
la_traits.h
10
la_traits.h
@ -40,6 +40,8 @@
|
||||
|
||||
#include "laerror.h"
|
||||
|
||||
#include "cuda_la.h"
|
||||
|
||||
#ifdef NONCBLAS
|
||||
#include "noncblas.h"
|
||||
#else
|
||||
@ -205,6 +207,8 @@ struct LA_traits_aux<complex<C>, scalar_true> {
|
||||
typedef complex<C> elementtype;
|
||||
typedef complex<C> producttype;
|
||||
typedef C normtype;
|
||||
typedef C realtype;
|
||||
typedef complex<C> complextype;
|
||||
static inline C sqrabs(const complex<C> x) { return x.real()*x.real()+x.imag()*x.imag();}
|
||||
static inline bool gencmp(const complex<C> *x, const complex<C> *y, int n) {return memcmp(x,y,n*sizeof(complex<C>));}
|
||||
static bool bigger(const complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
|
||||
@ -230,6 +234,8 @@ struct LA_traits_aux<C, scalar_true> {
|
||||
typedef C elementtype;
|
||||
typedef C producttype;
|
||||
typedef C normtype;
|
||||
typedef C realtype;
|
||||
typedef complex<C> complextype;
|
||||
static inline C sqrabs(const C x) { return x*x;}
|
||||
static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));}
|
||||
static inline bool bigger(const C &x, const C &y) {return x>y;}
|
||||
@ -261,6 +267,8 @@ struct LA_traits_aux<X<C>, scalar_false> { \
|
||||
typedef C elementtype; \
|
||||
typedef X<C> producttype; \
|
||||
typedef typename LA_traits<C>::normtype normtype; \
|
||||
typedef X<typename LA_traits<C>::realtype> realtype; \
|
||||
typedef X<typename LA_traits<C>::complextype> complextype; \
|
||||
static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
|
||||
static inline bool bigger(const C &x, const C &y) {return x>y;} \
|
||||
static inline bool smaller(const C &x, const C &y) {return x<y;} \
|
||||
@ -293,6 +301,8 @@ struct LA_traits_aux<X<C>, scalar_false> { \
|
||||
typedef C elementtype; \
|
||||
typedef NRMat<C> producttype; \
|
||||
typedef typename LA_traits<C>::normtype normtype; \
|
||||
typedef X<typename LA_traits<C>::realtype> realtype; \
|
||||
typedef X<typename LA_traits<C>::complextype> complextype; \
|
||||
static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
|
||||
static inline bool bigger(const C &x, const C &y) {return x>y;} \
|
||||
static inline bool smaller(const C &x, const C &y) {return x<y;} \
|
||||
|
13
laerror.cc
13
laerror.cc
@ -25,6 +25,7 @@
|
||||
#include <errno.h>
|
||||
#include <stdarg.h>
|
||||
|
||||
#include "cuda_la.h"
|
||||
|
||||
#ifdef USE_TRACEBACK
|
||||
#include "traceback.h"
|
||||
@ -32,6 +33,11 @@
|
||||
|
||||
namespace LA {
|
||||
|
||||
//enforce GPU initialization by a global class instantization constructor
|
||||
#ifdef CUDALA
|
||||
GPU_START gpu_start_instant;
|
||||
#endif
|
||||
|
||||
bool _LA_count_check=true;
|
||||
|
||||
extern "C" void _findme(void) {}; //for autoconf test we need a function with C linkage
|
||||
@ -45,6 +51,13 @@ void laerror(const char *s1)
|
||||
std::cerr << s1 << "\n";
|
||||
std::cout << s1 << "\n";
|
||||
}
|
||||
#ifdef CUDALA
|
||||
{
|
||||
cublasStatus s=cublasGetError();
|
||||
std::cerr << "CUBLAS status = "<<s<<std::endl;
|
||||
std::cout << "CUBLAS status = "<<s<<std::endl;
|
||||
}
|
||||
#endif
|
||||
if(errno) perror("system error");
|
||||
|
||||
throw LAerror(s1);
|
||||
|
322
mat.cc
322
mat.cc
@ -122,6 +122,15 @@ return r;
|
||||
template <typename T>
|
||||
void NRMat<T>::put(int fd, bool dim, bool transp) const
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location!=cpu)
|
||||
{
|
||||
NRMat<T> tmp= *this;
|
||||
tmp.moveto(cpu);
|
||||
tmp.put(fd,dim,transp);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
errno=0;
|
||||
if(dim)
|
||||
{
|
||||
@ -153,6 +162,17 @@ else LA_traits<T>::multiput(nn*mm,fd,
|
||||
template <typename T>
|
||||
void NRMat<T>::get(int fd, bool dim, bool transp)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location!=cpu)
|
||||
{
|
||||
NRMat<T> tmp;
|
||||
tmp.moveto(cpu);
|
||||
tmp.get(fd,dim,transp);
|
||||
tmp.moveto(location);
|
||||
*this = tmp;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
int nn0,mm0;
|
||||
errno=0;
|
||||
if(dim)
|
||||
@ -188,6 +208,43 @@ else LA_traits<T>::multiget(nn*mm,fd,
|
||||
|
||||
|
||||
// Assign diagonal
|
||||
template <>
|
||||
NRMat<double> & NRMat<double>::operator=(const double &a)
|
||||
{
|
||||
copyonwrite();
|
||||
#ifdef DEBUG
|
||||
if (nn != mm) laerror("RMat.operator=scalar on non-square matrix");
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
memset(v[0],0,nn*nn*sizeof(double));
|
||||
for (int i=0; i< nn; i++) v[i][i] = a;
|
||||
#else
|
||||
double n=0.;
|
||||
cblas_dcopy(nn*nn, &n, 0, v, 1);
|
||||
cblas_dcopy(nn, &a, 0, v, nn+1);
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
double *d=gpuputdouble(0.);
|
||||
cublasDcopy(nn*nn, d, 0, v, 1);
|
||||
gpufree(d);
|
||||
d=gpuputdouble(a);
|
||||
cublasDcopy(nn, d, 0, v, nn+1);
|
||||
gpufree(d);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
template <typename T>
|
||||
NRMat<T> & NRMat<T>::operator=(const T &a)
|
||||
{
|
||||
@ -206,6 +263,64 @@ NRMat<T> & NRMat<T>::operator=(const T &a)
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
NRMat<double> & NRMat<double>::operator+=(const double&a)
|
||||
{
|
||||
copyonwrite();
|
||||
#ifdef DEBUG
|
||||
if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix");
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
for (int i=0; i< nn; i++) v[i][i] += a;
|
||||
#else
|
||||
cblas_daxpy(nn, 1.0, &a, 0, *this, nn+1);
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
double *d=gpuputdouble(a);
|
||||
cublasDaxpy(nn, 1.0, d, 0, *this, nn+1);
|
||||
gpufree(d);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
template <>
|
||||
NRMat<double> & NRMat<double>::operator-=(const double&a)
|
||||
{
|
||||
copyonwrite();
|
||||
#ifdef DEBUG
|
||||
if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix");
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
for (int i=0; i< nn; i++) v[i][i] -= a;
|
||||
#else
|
||||
cblas_daxpy(nn, -1.0, &a, 0, *this, nn+1);
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
double *d=gpuputdouble(a);
|
||||
cublasDaxpy(nn, -1.0, d, 0, *this, nn+1);
|
||||
gpufree(d);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
// M += a
|
||||
@ -240,6 +355,31 @@ NRMat<T> & NRMat<T>::operator-=(const T &a)
|
||||
return *this;
|
||||
}
|
||||
|
||||
template <>
|
||||
const NRMat<double> NRMat<double>::operator-() const
|
||||
{
|
||||
NRMat<double> result(nn, mm);
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
for (int i=0; i<nn*mm; i++) result.v[0][i]= -v[0][i];
|
||||
#else
|
||||
cblas_dscal(nn*mm, -1., v, 1);
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
cublasDscal(nn*mm, -1., v, 1);
|
||||
}
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// unary minus
|
||||
template <typename T>
|
||||
const NRMat<T> NRMat<T>::operator-() const
|
||||
@ -253,6 +393,7 @@ const NRMat<T> NRMat<T>::operator-() const
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
// direct sum
|
||||
template <typename T>
|
||||
const NRMat<T> NRMat<T>::operator&(const NRMat<T> & b) const
|
||||
@ -540,7 +681,13 @@ template<>
|
||||
NRMat<double> & NRMat<double>::operator*=(const double &a)
|
||||
{
|
||||
copyonwrite();
|
||||
cblas_dscal(nn*mm, a, *this, 1);
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_dscal(nn*mm, a, *this, 1);
|
||||
#ifdef CUDALA
|
||||
else cublasDscal(nn*mm, a, v, 1);
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -559,6 +706,7 @@ NRMat< complex<double> >::operator*=(const complex<double> &a)
|
||||
template <typename T>
|
||||
NRMat<T> & NRMat<T>::operator*=(const T &a)
|
||||
{
|
||||
NOT_GPU(*this);
|
||||
copyonwrite();
|
||||
#ifdef MATPTR
|
||||
for (int i=0; i< nn*mm; i++) v[0][i] *= a;
|
||||
@ -578,8 +726,16 @@ NRMat<double> & NRMat<double>::operator+=(const NRMat<double> &rhs)
|
||||
if (nn != rhs.nn || mm!= rhs.mm)
|
||||
laerror("Mat += Mat of incompatible matrices");
|
||||
#endif
|
||||
SAME_LOC(*this,rhs);
|
||||
copyonwrite();
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1);
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -625,8 +781,16 @@ NRMat<double> & NRMat<double>::operator-=(const NRMat<double> &rhs)
|
||||
if (nn != rhs.nn || mm!= rhs.mm)
|
||||
laerror("Mat -= Mat of incompatible matrices");
|
||||
#endif
|
||||
SAME_LOC(*this,rhs);
|
||||
copyonwrite();
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1);
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -836,9 +1000,18 @@ const NRMat<double> NRMat<double>::operator*(const NRMat<double> &rhs) const
|
||||
if (mm != rhs.nn) laerror("product of incompatible matrices");
|
||||
if (rhs.mm <=0) laerror("illegal matrix dimension in gemm");
|
||||
#endif
|
||||
NRMat<double> result(nn, rhs.mm);
|
||||
SAME_LOC(*this,rhs);
|
||||
|
||||
NRMat<double> result(nn, rhs.mm,rhs.getlocation());
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nn, rhs.mm, mm, 1.0,
|
||||
*this, mm, rhs, rhs.mm, 0.0, result, rhs.mm);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
cublasDgemm('N','N',rhs.mm,nn,mm,1.0,rhs, rhs.mm,*this, mm, 0.0, result, rhs.mm);
|
||||
#endif
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -991,12 +1164,21 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
|
||||
if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in Mat:gemm()");
|
||||
if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm");
|
||||
#endif
|
||||
SAME_LOC3(*this,a,b);
|
||||
|
||||
if (alpha==0.0 && beta==1.0) return;
|
||||
|
||||
copyonwrite();
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans),
|
||||
(transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
|
||||
a.mm, b , b.mm, beta, *this , mm);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
cublasDgemm(transb,transa,mm,nn,k,alpha, b , b.mm, a,a.mm, beta, *this , mm);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -1028,7 +1210,20 @@ void NRMat< complex<double> >::gemm(const complex<double> & beta,
|
||||
template<>
|
||||
const double NRMat<double>::norm(const double scalar) const
|
||||
{
|
||||
if (!scalar) return cblas_dnrm2(nn*mm, (*this)[0], 1);
|
||||
if (!scalar)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
return cblas_dnrm2(nn*mm, (*this)[0], 1);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
return cublasDnrm2(nn*mm, v, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
NOT_GPU(*this);
|
||||
|
||||
double sum = 0;
|
||||
for (int i=0; i<nn; i++)
|
||||
for (int j=0; j<mm; j++) {
|
||||
@ -1246,6 +1441,127 @@ else //vectors are columns
|
||||
|
||||
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
// for a matrix A(1:nn,1:mm) performs Fortran-like
|
||||
// operation A(nn:-1:1,:)
|
||||
//------------------------------------------------------------------------------
|
||||
template<>
|
||||
NRMat<double>& NRMat<double>::SwapRows(){
|
||||
copyonwrite();
|
||||
const int n_pul = this->nn / 2;
|
||||
double * const dataIn = this->v;
|
||||
|
||||
for(register int i=0; i<n_pul; i++){
|
||||
cblas_dswap(mm, dataIn + i*mm, 1, dataIn + (nn-i-1)*mm, 1);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
template<>
|
||||
NRMat<complex<double> >& NRMat<complex<double> >::SwapRows(){
|
||||
copyonwrite();
|
||||
const int n = this->nn;
|
||||
const int m = this->mm;
|
||||
const int n_pul = this->nn / 2;
|
||||
complex<double> * const dataIn = this->v;
|
||||
|
||||
for(register int i=0; i<n_pul; i++){
|
||||
cblas_zswap(m, dataIn + i*m, 1, dataIn + (n-i-1)*m, 1);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
template<typename T>
|
||||
NRMat<T>& NRMat<T>::SwapRows(){
|
||||
copyonwrite();
|
||||
const int n = this->nn;
|
||||
const int m = this->mm;
|
||||
const int n_pul = this->nn / 2;
|
||||
T * const dataIn = this->v;
|
||||
|
||||
for(register int i=0; i<n_pul; i++){
|
||||
const int offset1 = i*m;
|
||||
const int offset2 = (n-i-1)*m;
|
||||
|
||||
for(register int j=0;j<m;j++){
|
||||
dataIn[offset1 + j] = dataIn[offset2 + j];
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
// for a matrix A(1:nn,1:mm) performs Fortran-like
|
||||
// operation A(:,mm:-1:1)
|
||||
//------------------------------------------------------------------------------
|
||||
template<>
|
||||
NRMat<double>& NRMat<double>::SwapCols(){
|
||||
copyonwrite();
|
||||
const int n = this->nn;
|
||||
const int m = this->mm;
|
||||
const int m_pul = m / 2;
|
||||
double * const dataIn = this->v;
|
||||
|
||||
for(register int i=0; i<m_pul; i++){
|
||||
cblas_dswap(n, dataIn + i, m, dataIn + (m-i-1), m);
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
template<>
|
||||
NRMat<complex<double> >& NRMat<complex<double> >::SwapCols(){
|
||||
copyonwrite();
|
||||
const int n_pul = this->nn / 2;
|
||||
const int m_pul = this->mm / 2;
|
||||
complex<double> * const dataIn = this->v;
|
||||
|
||||
for(register int i=0; i<m_pul; i++){
|
||||
cblas_zswap(nn, dataIn + i, mm, dataIn + (mm-i-1), mm);
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
template<typename T>
|
||||
NRMat<T>& NRMat<T>::SwapCols(){
|
||||
copyonwrite();
|
||||
const int n_pul = nn / 2;
|
||||
const int m_pul = mm / 2;
|
||||
T * const dataIn = this->v;
|
||||
|
||||
for(register int i=0; i<m_pul; i++){
|
||||
for(register int j=0;j<nn;j++){
|
||||
const int jm = j*mm;
|
||||
dataIn[i + jm] = dataIn[(mm-i-1) + jm];
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
// for a matrix A(1:nn,1:mm) performs Fortran-like
|
||||
// operation A(nn:-1:1,mm:-1:1)
|
||||
//------------------------------------------------------------------------------
|
||||
template<typename T>
|
||||
NRMat<T>& NRMat<T>::SwapRowsCols(){
|
||||
this->copyonwrite();
|
||||
const int n = this->nn;
|
||||
const int m = this->mm;
|
||||
T * const dataIn = this->v;
|
||||
T * const dataOut = this->v;
|
||||
|
||||
const int Dim = n*m;
|
||||
for(register int i=0;i<n;i++){
|
||||
const int off = i*n;
|
||||
for(register int j=0;j<m;j++){
|
||||
const int offset = off + j;
|
||||
dataOut[Dim - (offset + 1)] = dataIn[offset];
|
||||
}
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
276
mat.h
276
mat.h
@ -33,12 +33,20 @@ protected:
|
||||
T *v;
|
||||
#endif
|
||||
int *count;
|
||||
#ifdef CUDALA
|
||||
GPUID location;
|
||||
#endif
|
||||
public:
|
||||
friend class NRVec<T>;
|
||||
friend class NRSMat<T>;
|
||||
|
||||
inline NRMat() : nn(0), mm(0), v(0), count(0) {};
|
||||
inline NRMat(const int n, const int m);
|
||||
inline NRMat() : nn(0), mm(0), v(0), count(0)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location = DEFAULT_LOC;
|
||||
#endif
|
||||
};
|
||||
inline NRMat(const int n, const int m ,const GPUID loc= undefined);
|
||||
inline NRMat(const T &a, const int n, const int m);
|
||||
NRMat(const T *a, const int n, const int m);
|
||||
inline NRMat(const NRMat &rhs);
|
||||
@ -57,6 +65,13 @@ public:
|
||||
#endif
|
||||
const bool operator==(const NRMat &rhs) const {return !(*this != rhs);};
|
||||
inline int getcount() const {return count?*count:0;}
|
||||
#ifdef CUDALA
|
||||
inline GPUID getlocation() const {return location;}
|
||||
void moveto(const GPUID dest);
|
||||
#else
|
||||
inline GPUID getlocation() const {return cpu;}
|
||||
void moveto(const GPUID dest) {};
|
||||
#endif
|
||||
NRMat & operator=(const NRMat &rhs); //assignment
|
||||
void randomize(const typename LA_traits<T>::normtype &x); //fill with random numbers
|
||||
NRMat & operator=(const T &a); //assign a to diagonal
|
||||
@ -88,7 +103,7 @@ public:
|
||||
const NRMat operator*(const NRSMat<T> &rhs) const; // Mat * Smat
|
||||
const NRMat operator&(const NRMat &rhs) const; // direct sum
|
||||
const NRMat operator|(const NRMat<T> &rhs) const; // direct product
|
||||
const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
|
||||
const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
|
||||
const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
|
||||
const NRVec<T> rsum() const; //sum of rows
|
||||
const NRVec<T> csum() const; //sum of columns
|
||||
@ -157,9 +172,14 @@ public:
|
||||
namespace LA {
|
||||
// ctors
|
||||
template <typename T>
|
||||
NRMat<T>::NRMat(const int n, const int m) : nn(n), mm(m), count(new int)
|
||||
NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int)
|
||||
{
|
||||
*count = 1;
|
||||
#ifdef CUDALA
|
||||
location= (loc==undefined?DEFAULT_LOC:loc);
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
v = new T*[n];
|
||||
v[0] = new T[m*n];
|
||||
@ -167,14 +187,29 @@ NRMat<T>::NRMat(const int n, const int m) : nn(n), mm(m), count(new int)
|
||||
#else
|
||||
v = new T[m*n];
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
v= (T*) gpualloc(n*m*sizeof(T));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location=DEFAULT_LOC;
|
||||
#endif
|
||||
|
||||
int i;
|
||||
T *p;
|
||||
*count = 1;
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
v = new T*[n];
|
||||
p = v[0] = new T[m*n];
|
||||
@ -186,12 +221,29 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
|
||||
for (i=0; i< n*m; i++) *p++ = a;
|
||||
else
|
||||
memset(p, 0, n*m*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
v= (T*) gpualloc(n*m*sizeof(T));
|
||||
cublasSetVector(n*m,sizeof(T),&a,0,v,1);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location=DEFAULT_LOC;
|
||||
#endif
|
||||
|
||||
*count = 1;
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
v = new T*[n];
|
||||
v[0] = new T[m*n];
|
||||
@ -201,11 +253,25 @@ NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new
|
||||
v = new T[m*n];
|
||||
memcpy(v, a, n*m*sizeof(T));
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
v= (T*) gpualloc(n*m*sizeof(T));
|
||||
cublasSetVector(n*m,sizeof(T),a,1,v,1);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
//copy constructor
|
||||
template <typename T>
|
||||
NRMat<T>::NRMat(const NRMat &rhs)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
#endif
|
||||
nn = rhs.nn;
|
||||
mm = rhs.mm;
|
||||
count = rhs.count;
|
||||
@ -213,9 +279,16 @@ NRMat<T>::NRMat(const NRMat &rhs)
|
||||
if (count) ++(*count);
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
NRMat<T>::NRMat(const NRSMat<T> &rhs)
|
||||
{
|
||||
NOT_GPU(rhs);
|
||||
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
#endif
|
||||
|
||||
int i;
|
||||
nn = mm = rhs.nrows();
|
||||
count = new int;
|
||||
@ -244,6 +317,10 @@ NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset)
|
||||
{
|
||||
if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
|
||||
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
#endif
|
||||
|
||||
nn = n;
|
||||
mm = m;
|
||||
count = rhs.count;
|
||||
@ -303,6 +380,7 @@ inline T & NRMat<T>::operator()(const int i, const int j)
|
||||
if (_LA_count_check && *count != 1) laerror("Mat lval use of (,) with count > 1");
|
||||
if (i<0 || i>=nn &&nn>0 || j<0 || j>=mm && mm>0) laerror("Mat (,) out of range");
|
||||
if (!v) laerror("(,) for unallocated Mat");
|
||||
NOT_GPU(*this);
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
return v[i][j];
|
||||
@ -310,12 +388,14 @@ inline T & NRMat<T>::operator()(const int i, const int j)
|
||||
return v[i*mm+j];
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline const T & NRMat<T>::operator()(const int i, const int j) const
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (i<0 || i>=nn&&nn>0 || j<0 || j>=mm&& mm>0) laerror("Mat (,) out of range");
|
||||
if (!v) laerror("(,) for unallocated Mat");
|
||||
NOT_GPU(*this); //in principle we could copy the element to CPU memory, yielding, however, a highly inneficient contruct
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
return v[i][j];
|
||||
@ -391,7 +471,7 @@ inline const complex<double> NRMat< complex<double> >::amax() const
|
||||
}
|
||||
|
||||
|
||||
//basi stuff to be available for any type ... must be in .h
|
||||
//basic stuff to be available for any type ... must be in .h
|
||||
// dtor
|
||||
template <typename T>
|
||||
NRMat<T>::~NRMat()
|
||||
@ -399,10 +479,21 @@ NRMat<T>::~NRMat()
|
||||
if (!count) return;
|
||||
if (--(*count) <= 0) {
|
||||
if (v) {
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
{
|
||||
#ifdef MATPTR
|
||||
delete[] (v[0]);
|
||||
#endif
|
||||
delete[] v;
|
||||
}
|
||||
#ifdef CUDALA
|
||||
else
|
||||
{
|
||||
gpufree(v);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
delete count;
|
||||
}
|
||||
@ -415,14 +506,27 @@ NRMat<T> & NRMat<T>::operator=(const NRMat<T> &rhs)
|
||||
if (this !=&rhs)
|
||||
{
|
||||
if (count)
|
||||
if (--(*count) ==0 ) {
|
||||
if (--(*count) ==0 )
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
delete[] (v[0]);
|
||||
#endif
|
||||
delete[] v;
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else gpufree(v);
|
||||
#endif
|
||||
|
||||
delete count;
|
||||
}
|
||||
v = rhs.v;
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
#endif
|
||||
nn = rhs.nn;
|
||||
mm = rhs.mm;
|
||||
count = rhs.count;
|
||||
@ -437,46 +541,8 @@ template <typename T>
|
||||
NRMat<T> & NRMat<T>::operator|=(const NRMat<T> &rhs)
|
||||
{
|
||||
if (this == &rhs) return *this;
|
||||
#ifdef DEBUG
|
||||
if (!rhs.v) laerror("unallocated rhs in Mat operator |=");
|
||||
#endif
|
||||
if (count)
|
||||
if (*count > 1) {
|
||||
--(*count);
|
||||
nn = 0;
|
||||
mm = 0;
|
||||
count = 0;
|
||||
v = 0;
|
||||
}
|
||||
if (nn != rhs.nn || mm != rhs.mm) {
|
||||
if (v) {
|
||||
#ifdef MATPTR
|
||||
delete[] (v[0]);
|
||||
#endif
|
||||
delete[] (v);
|
||||
v = 0;
|
||||
}
|
||||
nn = rhs.nn;
|
||||
mm = rhs.mm;
|
||||
}
|
||||
if (!v) {
|
||||
#ifdef MATPTR
|
||||
v = new T*[nn];
|
||||
v[0] = new T[mm*nn];
|
||||
#else
|
||||
v = new T[mm*nn];
|
||||
#endif
|
||||
}
|
||||
#ifdef MATPTR
|
||||
for (int i=1; i< nn; i++) v[i] = v[i-1] + mm;
|
||||
memcpy(v[0], rhs.v[0], nn*mm*sizeof(T));
|
||||
#else
|
||||
memcpy(v, rhs.v, nn*mm*sizeof(T));
|
||||
#endif
|
||||
|
||||
if (!count) count = new int;
|
||||
*count = 1;
|
||||
|
||||
*this = rhs;
|
||||
this->copyonwrite();
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -486,9 +552,13 @@ void NRMat<T>::copyonwrite()
|
||||
{
|
||||
if (!count) laerror("Mat::copyonwrite of undefined matrix");
|
||||
if (*count > 1) {
|
||||
(*count)--;
|
||||
count = new int;
|
||||
*count = 1;
|
||||
(*count)--;
|
||||
count = new int;
|
||||
*count = 1;
|
||||
#ifdef CUDALA
|
||||
if(location==cpu) //matrix is in CPU memory
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
T **newv = new T*[nn];
|
||||
newv[0] = new T[mm*nn];
|
||||
@ -499,10 +569,21 @@ void NRMat<T>::copyonwrite()
|
||||
T *newv = new T[mm*nn];
|
||||
memcpy(newv, v, mm*nn*sizeof(T));
|
||||
v = newv;
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else //matrix is in GPU memory
|
||||
{
|
||||
T *newv = (T *) gpualloc(mm*nn*sizeof(T));
|
||||
if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
|
||||
cublasScopy(nn*mm*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
|
||||
v = newv;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
void NRMat<T>::resize(int n, int m)
|
||||
{
|
||||
@ -519,10 +600,18 @@ if(m==0) n=0;
|
||||
if(n==0 && m==0)
|
||||
{
|
||||
if(--(*count) <= 0) {
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
if(v) delete[] (v[0]);
|
||||
#endif
|
||||
if(v) delete[] v;
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else gpufree(v);
|
||||
#endif
|
||||
delete count;
|
||||
}
|
||||
count=0;
|
||||
@ -543,6 +632,10 @@ if(m==0) n=0;
|
||||
*count = 1;
|
||||
nn = n;
|
||||
mm = m;
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
v = new T*[nn];
|
||||
v[0] = new T[m*n];
|
||||
@ -550,12 +643,22 @@ if(m==0) n=0;
|
||||
#else
|
||||
v = new T[m*n];
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
v = (T *) gpualloc(n*m*sizeof(T));
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
// At this point *count = 1, check if resize is necessary
|
||||
if (n!=nn || m!=mm) {
|
||||
nn = n;
|
||||
mm = m;
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
#ifdef MATPTR
|
||||
delete[] (v[0]);
|
||||
#endif
|
||||
@ -566,6 +669,14 @@ if(m==0) n=0;
|
||||
for (int i=1; i< n; i++) v[i] = v[i-1] + m;
|
||||
#else
|
||||
v = new T[m*n];
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
gpufree(v);
|
||||
v=(T *) gpualloc(n*m*sizeof(T));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
@ -587,7 +698,11 @@ return r;
|
||||
// I/O
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream &s, const NRMat<T> &x)
|
||||
{
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(x.getlocation()==cpu)
|
||||
{
|
||||
#endif
|
||||
int i,j,n,m;
|
||||
n=x.nrows();
|
||||
m=x.ncols();
|
||||
@ -597,18 +712,43 @@ std::ostream& operator<<(std::ostream &s, const NRMat<T> &x)
|
||||
for(j=0; j<m;j++) s << (typename LA_traits_io<T>::IOtype) x[i][j] << (j==m-1 ? '\n' : ' '); // endl cannot be used in the conditional expression, since it is an overloaded function
|
||||
}
|
||||
return s;
|
||||
}
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
NRMat<T> tmp=x;
|
||||
tmp.moveto(cpu);
|
||||
return s<<tmp;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::istream& operator>>(std::istream &s, NRMat<T> &x)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(x.getlocation()==cpu)
|
||||
{
|
||||
#endif
|
||||
int i,j,n,m;
|
||||
s >> n >> m;
|
||||
x.resize(n,m);
|
||||
typename LA_traits_io<T>::IOtype tmp;
|
||||
for(i=0;i<n;i++) for(j=0; j<m;j++) { s>>tmp; x[i][j]=tmp;}
|
||||
return s;
|
||||
}
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
NRMat<T> tmp;
|
||||
tmp.moveto(cpu);
|
||||
s >> tmp;
|
||||
tmp.moveto(x.getlocation());
|
||||
x=tmp;
|
||||
return s;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
//optional indexing from 1
|
||||
@ -671,6 +811,38 @@ NRMat<T> & NRMat<T>::operator^=(const NRMat<T> &rhs){
|
||||
}
|
||||
|
||||
|
||||
#ifdef CUDALA
|
||||
template<typename T>
|
||||
void NRMat<T>::moveto(const GPUID dest)
|
||||
{
|
||||
if(location==dest) return;
|
||||
location=dest;
|
||||
|
||||
if(v && !count) laerror("internal inconsistency of reference counting 1");
|
||||
if (!count) return;
|
||||
|
||||
if(v && *count==0) laerror("internal inconsistency of reference counting 2");
|
||||
if(!v) return;
|
||||
|
||||
T *vold = v;
|
||||
|
||||
if(dest == cpu) //moving from GPU to CPU
|
||||
{
|
||||
v = new T[nn*mm];
|
||||
gpuget(nn*mm,sizeof(T),vold,v);
|
||||
if(*count == 1) gpufree(vold);
|
||||
else {--(*count); count = new int(1);}
|
||||
}
|
||||
else //moving from CPU to GPU
|
||||
{
|
||||
v=(T *) gpualloc(nn*mm*sizeof(T));
|
||||
gpuput(nn*mm,sizeof(T),vold,v);
|
||||
if(*count == 1) delete[] vold;
|
||||
else {--(*count); count = new int(1);}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
//end CUDALA
|
||||
|
||||
|
||||
|
||||
|
231
noncblas.cc
231
noncblas.cc
@ -20,106 +20,186 @@
|
||||
#include "noncblas.h"
|
||||
#include "laerror.h"
|
||||
#include "mat.h"
|
||||
#include "fortran.h"
|
||||
|
||||
|
||||
#ifdef FORTRAN_
|
||||
#define FORNAME(x) x##_
|
||||
#else
|
||||
#define FORNAME(x) x
|
||||
#endif
|
||||
|
||||
#ifdef NONCBLAS
|
||||
//Level 1 - straightforward wrappers
|
||||
|
||||
extern "C" double FORNAME(ddot) (const int *n, const double *x, const int *incx, const double *y, const int *incy);
|
||||
extern "C" double FORNAME(ddot) (const FINT *n, const double *x, const FINT *incx, const double *y, const FINT *incy);
|
||||
double cblas_ddot(const int N, const double *X, const int incX,
|
||||
const double *Y, const int incY)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp);
|
||||
#else
|
||||
return FORNAME(ddot)(&N,X,&incX,Y,&incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" void FORNAME(dscal) (const int *n, const double *a, double *x, const int *incx);
|
||||
extern "C" void FORNAME(dscal) (const FINT *n, const double *a, double *x, const FINT *incx);
|
||||
void cblas_dscal(const int N, const double alpha, double *X, const int incX)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp);
|
||||
#else
|
||||
FORNAME(dscal) (&N,&alpha,X,&incX);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" void FORNAME(dcopy) (const int *n, const double *x, const int *incx, double *y, const int *incy);
|
||||
extern "C" void FORNAME(dcopy) (const FINT *n, const double *x, const FINT *incx, double *y, const FINT *incy);
|
||||
void cblas_dcopy(const int N, const double *X, const int incX,
|
||||
double *Y, const int incY)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
|
||||
#else
|
||||
FORNAME(dcopy) (&N,X,&incX,Y,&incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" void FORNAME(daxpy) (const int *n, const double *a, const double *x, const int *incx, double *y, const int *incy);
|
||||
extern "C" void FORNAME(daxpy) (const FINT *n, const double *a, const double *x, const FINT *incx, double *y, const FINT *incy);
|
||||
void cblas_daxpy(const int N, const double alpha, const double *X,
|
||||
const int incX, double *Y, const int incY)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp);
|
||||
#else
|
||||
FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" double FORNAME(dnrm2) (const int *n, const double *x, const int *incx);
|
||||
extern "C" double FORNAME(dnrm2) (const FINT *n, const double *x, const FINT *incx);
|
||||
double cblas_dnrm2(const int N, const double *X, const int incX)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
return FORNAME(dnrm2) (&ntmp,X,&incxtmp);
|
||||
#else
|
||||
return FORNAME(dnrm2) (&N,X,&incX);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" double FORNAME(dasum) (const int *n, const double *x, const int *incx);
|
||||
extern "C" double FORNAME(dasum) (const FINT *n, const double *x, const FINT *incx);
|
||||
double cblas_dasum(const int N, const double *X, const int incX)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
return FORNAME(dasum) (&ntmp,X,&incxtmp);
|
||||
#else
|
||||
return FORNAME(dasum) (&N,X,&incX);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" void FORNAME(zcopy) (const int *n, const void *x, const int *incx, void *y, const int *incy);
|
||||
extern "C" void FORNAME(zcopy) (const FINT *n, const void *x, const FINT *incx, void *y, const FINT *incy);
|
||||
void cblas_zcopy(const int N, const void *X, const int incX,
|
||||
void *Y, const int incY)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
|
||||
#else
|
||||
FORNAME(zcopy) (&N,X,&incX,Y,&incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" void FORNAME(zaxpy) (const int *n, const void *a, const void *x, const int *incx, void *y, const int *incy);
|
||||
extern "C" void FORNAME(zaxpy) (const FINT *n, const void *a, const void *x, const FINT *incx, void *y, const FINT *incy);
|
||||
void cblas_zaxpy(const int N, const void *alpha, const void *X,
|
||||
const int incX, void *Y, const int incY)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp);
|
||||
#else
|
||||
FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" void FORNAME(zscal) (const int *n, const void *a, void *x, const int *incx);
|
||||
extern "C" void FORNAME(zscal) (const FINT *n, const void *a, void *x, const FINT *incx);
|
||||
void cblas_zscal(const int N, const void *alpha, void *X, const int incX)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
FORNAME(zscal)(&ntmp,alpha,X,&incxtmp);
|
||||
#else
|
||||
FORNAME(zscal)(&N,alpha,X,&incX);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" void FORNAME(zdscal) (const int *n, const double *a, void *x, const int *incx);
|
||||
extern "C" void FORNAME(zdscal) (const FINT *n, const double *a, void *x, const FINT *incx);
|
||||
void cblas_zdscal(const int N, const double alpha, void *X, const int incX)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp);
|
||||
#else
|
||||
FORNAME(zdscal)(&N,&alpha,X,&incX);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
extern "C" double FORNAME(dznrm2) (const int *n, const void *x, const int *incx);
|
||||
extern "C" double FORNAME(dznrm2) (const FINT *n, const void *x, const FINT *incx);
|
||||
double cblas_dznrm2(const int N, const void *X, const int incX)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
return FORNAME(dznrm2) (&ntmp,X,&incxtmp);
|
||||
#else
|
||||
return FORNAME(dznrm2) (&N,X,&incX);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
//the following ones are f2c-compatible, but is it truly portable???
|
||||
|
||||
extern "C" void FORNAME(zdotu) (void *retval, const int *n, const void *x, const int *incx, const void *y, const int *incy);
|
||||
extern "C" void FORNAME(zdotu) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
|
||||
|
||||
void cblas_zdotu_sub(const int N, const void *X, const int incX,
|
||||
const void *Y, const int incY, void *dotu)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp);
|
||||
#else
|
||||
FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
extern "C" void FORNAME(zdotc) (void *retval, const int *n, const void *x, const int *incx, const void *y, const int *incy);
|
||||
extern "C" void FORNAME(zdotc) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
|
||||
void cblas_zdotc_sub(const int N, const void *X, const int incX,
|
||||
const void *Y, const int incY, void *dotc)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp);
|
||||
#else
|
||||
FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -129,7 +209,7 @@ FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
|
||||
//enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, AtlasConj=114};
|
||||
|
||||
|
||||
extern "C" void FORNAME(dspmv) (const char *uplo, const int *n, const double *alpha, const double *ap, const double *x, const int *incx, const double *beta, double *y, const int *incy);
|
||||
extern "C" void FORNAME(dspmv) (const char *uplo, const FINT *n, const double *alpha, const double *ap, const double *x, const FINT *incx, const double *beta, double *y, const FINT *incy);
|
||||
void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
|
||||
const int N, const double alpha, const double *Ap,
|
||||
const double *X, const int incX,
|
||||
@ -137,11 +217,18 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
|
||||
{
|
||||
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
|
||||
if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
|
||||
#else
|
||||
FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
extern "C" void FORNAME(zhpmv) (const char *uplo, const int *n, const void *alpha, const void *ap, const void *x, const int *incx, const void *beta, void *y, const int *incy);
|
||||
extern "C" void FORNAME(zhpmv) (const char *uplo, const FINT *n, const void *alpha, const void *ap, const void *x, const FINT *incx, const void *beta, void *y, const FINT *incy);
|
||||
void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
|
||||
const int N, const void *alpha, const void *Ap,
|
||||
const void *X, const int incX,
|
||||
@ -149,20 +236,36 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
|
||||
{
|
||||
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
|
||||
if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
|
||||
#else
|
||||
FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
//Level 2 and Level 3 on general matrices - take into account the transposed storage of matrices in Fortran and C
|
||||
|
||||
extern "C" void FORNAME(dger) (const int *m, const int *n, const double *alpha, const double *x, const int *incx, const double *y, const int *incy, double *a, const int *lda);
|
||||
extern "C" void FORNAME(dger) (const FINT *m, const FINT *n, const double *alpha, const double *x, const FINT *incx, const double *y, const FINT *incy, double *a, const FINT *lda);
|
||||
void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
|
||||
const double alpha, const double *X, const int incX,
|
||||
const double *Y, const int incY, double *A, const int lda)
|
||||
{
|
||||
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
|
||||
//swap m-n, y-x
|
||||
#ifdef FORINT
|
||||
const FINT mtmp=M;
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
const FINT ldatmp=lda;
|
||||
FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp);
|
||||
#else
|
||||
FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda);
|
||||
#endif
|
||||
}
|
||||
|
||||
void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
|
||||
@ -179,7 +282,7 @@ void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
|
||||
LA::laerror("cblas_zgeru cannot be simply converted to fortran order");
|
||||
}
|
||||
|
||||
extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc);
|
||||
extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const double *alpha, const double *a, const FINT *lda, const double *b, const FINT *ldb, const double *beta, double *c, const FINT *ldc);
|
||||
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
|
||||
const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
|
||||
const int K, const double alpha, const double *A,
|
||||
@ -188,11 +291,22 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
|
||||
{
|
||||
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
|
||||
//swap a-b, m-n
|
||||
#ifdef FORINT
|
||||
const FINT mtmp=M;
|
||||
const FINT ntmp=N;
|
||||
const FINT ktmp=K;
|
||||
const FINT ldatmp=lda;
|
||||
const FINT ldbtmp=ldb;
|
||||
const FINT ldctmp=ldc;
|
||||
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
|
||||
&ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
|
||||
#else
|
||||
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
|
||||
&N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const int *m, const int *n, const int *k, const void *alpha, const void *a, const int *lda, const void *b, const int *ldb, const void *beta, void *c, const int *ldc);
|
||||
extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const void *alpha, const void *a, const FINT *lda, const void *b, const FINT *ldb, const void *beta, void *c, const FINT *ldc);
|
||||
void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
|
||||
const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
|
||||
const int K, const void *alpha, const void *A,
|
||||
@ -201,26 +315,49 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
|
||||
{
|
||||
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
|
||||
//swap a-b, m-n
|
||||
#ifdef FORINT
|
||||
const FINT mtmp=M;
|
||||
const FINT ntmp=N;
|
||||
const FINT ktmp=K;
|
||||
const FINT ldatmp=lda;
|
||||
const FINT ldbtmp=ldb;
|
||||
const FINT ldctmp=ldc;
|
||||
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
|
||||
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
|
||||
&ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
|
||||
#else
|
||||
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
|
||||
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
|
||||
&N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
extern "C" void FORNAME(dgemv) (const char *TRANS, const int *M, const int *N, const double *ALPHA, const double *A, const int *LDA, const double *X, const int *INCX, const double *BETA, double *Y, const int *INCY);
|
||||
extern "C" void FORNAME(dgemv) (const char *TRANS, const FINT *M, const FINT *N, const double *ALPHA, const double *A, const FINT *LDA, const double *X, const FINT *INCX, const double *BETA, double *Y, const FINT *INCY);
|
||||
void cblas_dgemv(const enum CBLAS_ORDER Order,
|
||||
const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
|
||||
const double alpha, const double *A, const int lda,
|
||||
const double *X, const int incX, const double beta,
|
||||
double *Y, const int incY)
|
||||
{
|
||||
#ifdef FORINT
|
||||
const FINT mtmp=M;
|
||||
const FINT ntmp=N;
|
||||
const FINT ldatmp=lda;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
|
||||
//swap n-m and toggle transposition
|
||||
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
|
||||
#else
|
||||
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
|
||||
//swap n-m and toggle transposition
|
||||
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
extern "C" void FORNAME(zgemv) (const char *TRANS, const int *M, const int *N, const void *ALPHA, const void *A, const int *LDA, const void *X, const int *INCX, const void *BETA, void *Y, const int *INCY);
|
||||
extern "C" void FORNAME(zgemv) (const char *TRANS, const FINT *M, const FINT *N, const void *ALPHA, const void *A, const FINT *LDA, const void *X, const FINT *INCX, const void *BETA, void *Y, const FINT *INCY);
|
||||
void cblas_zgemv(const enum CBLAS_ORDER Order,
|
||||
const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
|
||||
const void *alpha, const void *A, const int lda,
|
||||
@ -230,14 +367,29 @@ void cblas_zgemv(const enum CBLAS_ORDER Order,
|
||||
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
|
||||
if(TransA == CblasConjTrans) LA::laerror("zgemv with CblasConjTrans not supportted");
|
||||
//swap n-m and toggle transposition
|
||||
#ifdef FORINT
|
||||
const FINT mtmp=M;
|
||||
const FINT ntmp=N;
|
||||
const FINT ldatmp=lda;
|
||||
const FINT incxtmp=incX;
|
||||
const FINT incytmp=incY;
|
||||
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
|
||||
#else
|
||||
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
|
||||
#endif
|
||||
}
|
||||
|
||||
extern "C" int FORNAME(idamax) (const int *N, const double *DX, const int *INCX);
|
||||
extern "C" FINT FORNAME(idamax) (const FINT *N, const double *DX, const FINT *INCX);
|
||||
|
||||
CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX)
|
||||
{
|
||||
return FORNAME(idamax)(&N,X,&incX);
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT incxtmp=incX;
|
||||
return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp);
|
||||
#else
|
||||
return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -246,21 +398,38 @@ return FORNAME(idamax)(&N,X,&incX);
|
||||
#ifdef NONCLAPACK
|
||||
//clapack_dgesv
|
||||
//allocate auxiliary storage and transpose input and output quantities to fortran/C order
|
||||
extern "C" void FORNAME(dgesv) (const int *N, const int *NRHS, double *A, const int *LDA, int *IPIV, double *B, const int *LDB, int *INFO);
|
||||
extern "C" void FORNAME(dgesv) (const FINT *N, const FINT *NRHS, double *A, const FINT *LDA, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO);
|
||||
|
||||
int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
|
||||
double *A, const int lda, int *ipiv,
|
||||
double *B, const int ldb)
|
||||
{
|
||||
int INFO=0;
|
||||
FINT INFO=0;
|
||||
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
|
||||
//B should be in the same physical order, just transpose A in place and the LU result on output
|
||||
for(int i=1; i<N; ++i) for(int j=0; j<i; ++j) {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=N;
|
||||
const FINT nrhstmp=NRHS;
|
||||
const FINT ldatmp=lda;
|
||||
const FINT ldbtmp=ldb;
|
||||
FINT ipivtmp=*ipiv;
|
||||
FORNAME(dgesv) (&ntmp,&nrhstmp,A,&ldatmp,&ipivtmp,B,&ldbtmp,&INFO);
|
||||
#else
|
||||
FORNAME(dgesv) (&N,&NRHS,A,&lda,ipiv,B,&ldb,&INFO);
|
||||
#endif
|
||||
for(int i=1; i<N; ++i) for(int j=0; j<i; ++j) {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
|
||||
return INFO;
|
||||
return (int)INFO;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
void cblas_dswap(const int N, double *X, const int incX,
|
||||
double *Y, const int incY){
|
||||
LA::laerror("cblas_dswap is not available... (-DNONCBLAS)");
|
||||
}
|
||||
|
||||
void cblas_zswap(const int N, void *X, const int incX,
|
||||
void *Y, const int incY){
|
||||
LA::laerror("cblas_zswap is not available... (-DNONCBLAS)");
|
||||
}
|
||||
|
319
nonclass.cc
319
nonclass.cc
@ -23,14 +23,11 @@
|
||||
#include "mat.h"
|
||||
#include "nonclass.h"
|
||||
#include "qsort.h"
|
||||
#include "fortran.h"
|
||||
|
||||
|
||||
namespace LA {
|
||||
|
||||
#ifdef FORTRAN_
|
||||
#define FORNAME(x) x##_
|
||||
#else
|
||||
#define FORNAME(x) x
|
||||
#endif
|
||||
|
||||
#define INSTANTIZE(T) \
|
||||
template void lawritemat(FILE *file,const T *a,int r,int c,const char *form0, \
|
||||
@ -191,16 +188,23 @@ linear_solve_do(A,&B[0],1,A.nrows(),det,n);
|
||||
// Next routines are not available in clapack, fotran ones will be used with an
|
||||
// additional swap/transpose of outputs when needed
|
||||
|
||||
extern "C" void FORNAME(dspsv)(const char *UPLO, const int *N, const int *NRHS,
|
||||
double *AP, int *IPIV, double *B, const int *LDB, int *INFO);
|
||||
extern "C" void FORNAME(dspsv)(const char *UPLO, const FINT *N, const FINT *NRHS,
|
||||
double *AP, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO);
|
||||
|
||||
static void linear_solve_do(NRSMat<double> &a, double *b, const int nrhs, const int ldb, double *det, int n)
|
||||
{
|
||||
int r, *ipiv;
|
||||
FINT r, *ipiv;
|
||||
a.copyonwrite();
|
||||
ipiv = new int[n];
|
||||
ipiv = new FINT[n];
|
||||
char U = 'U';
|
||||
#ifdef FORINT
|
||||
const FINT ntmp=n;
|
||||
const FINT nrhstmp=nrhs;
|
||||
const FINT ldbtmp=ldb;
|
||||
FORNAME(dspsv)(&U, &ntmp, &nrhstmp, a, ipiv, b, &ldbtmp,&r);
|
||||
#else
|
||||
FORNAME(dspsv)(&U, &n, &nrhs, a, ipiv, b, &ldb,&r);
|
||||
#endif
|
||||
if (r < 0) {
|
||||
delete[] ipiv;
|
||||
laerror("illegal argument in spsv() call of linear_solve()");
|
||||
@ -239,8 +243,8 @@ linear_solve_do(a,&B[0],1,a.nrows(),det,n);
|
||||
//other version of linear solver based on gesvx
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
extern "C" void FORNAME(zgesvx)(const char *fact, const char *trans, const int *n, const int *nrhs, complex<double> *A, const int *lda, complex<double> *AF, const int *ldaf, const int *ipiv, char *equed, double *R,double *C, complex<double> *B, const int *ldb, complex<double> *X, const int *ldx, double *rcond, double *ferr, double *berr, complex<double> *work, double *rwork, int *info);
|
||||
extern "C" void FORNAME(dgesvx)(const char *fact, const char *trans, const int *n, const int *nrhs, double *A, const int *lda, double *AF, const int *ldaf, const int *ipiv, char *equed, double *R,double *C, double *B, const int *ldb, double *X, const int *ldx, double *rcond, double *ferr, double *berr, double *work, int *iwork, int *info);
|
||||
extern "C" void FORNAME(zgesvx)(const char *fact, const char *trans, const FINT *n, const FINT *nrhs, complex<double> *A, const FINT *lda, complex<double> *AF, const FINT *ldaf, const FINT *ipiv, char *equed, double *R,double *C, complex<double> *B, const FINT *ldb, complex<double> *X, const FINT *ldx, double *rcond, double *ferr, double *berr, complex<double> *work, double *rwork, FINT *info);
|
||||
extern "C" void FORNAME(dgesvx)(const char *fact, const char *trans, const FINT *n, const FINT *nrhs, double *A, const FINT *lda, double *AF, const FINT *ldaf, const FINT *ipiv, char *equed, double *R,double *C, double *B, const FINT *ldb, double *X, const FINT *ldx, double *rcond, double *ferr, double *berr, double *work, FINT *iwork, FINT *info);
|
||||
//------------------------------------------------------------------------------
|
||||
// solves set of linear equations using dgesvx
|
||||
// input:
|
||||
@ -270,18 +274,18 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
|
||||
double *A;
|
||||
double * const _A_data = (double*)_A;
|
||||
|
||||
int info;
|
||||
const int nrhs = _rhsCount;
|
||||
const int n = _eqCount;
|
||||
int lda = A_cols;
|
||||
const int ldaf = lda;
|
||||
FINT info;
|
||||
const FINT nrhs = _rhsCount;
|
||||
const FINT n = _eqCount;
|
||||
FINT lda = A_cols;
|
||||
const FINT ldaf = lda;
|
||||
|
||||
double rcond;
|
||||
double ferr[nrhs], berr[nrhs], work[4*n];
|
||||
double R[n], C[n];
|
||||
|
||||
int *const iwork = new int[n];
|
||||
int *const ipiv = new int[n];
|
||||
FINT *const iwork = new FINT[n];
|
||||
FINT *const ipiv = new FINT[n];
|
||||
|
||||
double *X = new double[n*nrhs];
|
||||
double *AF = new double[ldaf*n];
|
||||
@ -312,7 +316,7 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
|
||||
if(_saveA){
|
||||
delete[] A;
|
||||
}
|
||||
return info;
|
||||
return (int)info;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
// solves set of linear equations using zgesvx
|
||||
@ -343,18 +347,18 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
|
||||
complex<double> *A;
|
||||
complex<double> * const _A_data = (complex<double>*)_A;
|
||||
|
||||
int info;
|
||||
const int nrhs = _rhsCount;
|
||||
const int n = _eqCount;
|
||||
int lda = A_cols;
|
||||
const int ldaf = lda;
|
||||
FINT info;
|
||||
const FINT nrhs = _rhsCount;
|
||||
const FINT n = _eqCount;
|
||||
FINT lda = A_cols;
|
||||
const FINT ldaf = lda;
|
||||
|
||||
double rcond;
|
||||
double ferr[nrhs], berr[nrhs];
|
||||
double R[n], C[n], rwork[2*n];
|
||||
complex<double> work[2*n];
|
||||
|
||||
int *const ipiv = new int[n];
|
||||
FINT *const ipiv = new FINT[n];
|
||||
|
||||
complex<double> *X = new complex<double>[n*nrhs];
|
||||
complex<double> *AF = new complex<double>[ldaf*n];
|
||||
@ -386,7 +390,7 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
|
||||
if(_saveA){
|
||||
delete[] A;
|
||||
}
|
||||
return info;
|
||||
return (int)info;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
// for given square matrices A, B computes X = AB^{-1} as follows
|
||||
@ -402,8 +406,8 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
|
||||
//------------------------------------------------------------------------------
|
||||
int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, double *_rcond){
|
||||
|
||||
const int n = _A.nrows();
|
||||
const int m = _A.ncols();
|
||||
const FINT n = _A.nrows();
|
||||
const FINT m = _A.ncols();
|
||||
if(n != m || n != _B.nrows() || n != _B.ncols()){
|
||||
laerror("multiply_by_inverse: incompatible matrices");
|
||||
}
|
||||
@ -417,13 +421,13 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
|
||||
double * const B = (double*)_B;
|
||||
_B.copyonwrite();//even if fact='N', call copyonwrite because the solution is going to be stored in _B
|
||||
|
||||
int info;
|
||||
FINT info;
|
||||
double rcond;
|
||||
double ferr[n], berr[n], work[4*n];
|
||||
double R[n], C[n];
|
||||
|
||||
int *const iwork = new int[n];
|
||||
int *const ipiv = new int[n];
|
||||
FINT *const iwork = new FINT[n];
|
||||
FINT *const ipiv = new FINT[n];
|
||||
|
||||
double *X = new double[n2];
|
||||
double *AF = new double[n2];
|
||||
@ -437,7 +441,7 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
|
||||
delete[] iwork;delete[] ipiv;
|
||||
delete[] AF;delete[] X;
|
||||
|
||||
return info;
|
||||
return (int)info;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
// for given square matrices A, B computes X = AB^{-1} as follows
|
||||
@ -453,8 +457,8 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
|
||||
//------------------------------------------------------------------------------
|
||||
int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B, bool _useEq, double *_rcond){
|
||||
|
||||
const int n = _A.nrows();
|
||||
const int m = _A.ncols();
|
||||
const FINT n = _A.nrows();
|
||||
const FINT m = _A.ncols();
|
||||
if(n != m || n != _B.nrows() || n != _B.ncols()){
|
||||
laerror("multiply_by_inverse: incompatible matrices");
|
||||
}
|
||||
@ -468,13 +472,13 @@ int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B
|
||||
complex<double> * const B = (complex<double>*)_B;
|
||||
_B.copyonwrite();//even if fact='N', call copyonwrite because the solution is going to be stored in _B
|
||||
|
||||
int info;
|
||||
FINT info;
|
||||
double rcond;
|
||||
double ferr[n], berr[n];
|
||||
double R[n], C[n], rwork[2*n];
|
||||
complex<double> work[2*n];
|
||||
|
||||
int *const ipiv = new int[n];
|
||||
FINT *const ipiv = new FINT[n];
|
||||
|
||||
complex<double> *X = new complex<double>[n2];
|
||||
complex<double> *AF = new complex<double>[n2];
|
||||
@ -488,24 +492,24 @@ int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B
|
||||
delete[] ipiv;
|
||||
delete[] AF;delete[] X;
|
||||
|
||||
return info;
|
||||
return (int)info;
|
||||
}
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
|
||||
|
||||
extern "C" void FORNAME(dsyev)(const char *JOBZ, const char *UPLO, const int *N,
|
||||
double *A, const int *LDA, double *W, double *WORK, const int *LWORK, int *INFO);
|
||||
extern "C" void FORNAME(dsyev)(const char *JOBZ, const char *UPLO, const FINT *N,
|
||||
double *A, const FINT *LDA, double *W, double *WORK, const FINT *LWORK, FINT *INFO);
|
||||
|
||||
extern "C" void FORNAME(dsygv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
|
||||
double *A, const int *LDA, double *B, const int *LDB, double *W, double *WORK, const int *LWORK, int *INFO);
|
||||
extern "C" void FORNAME(dsygv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
|
||||
double *A, const FINT *LDA, double *B, const FINT *LDB, double *W, double *WORK, const FINT *LWORK, FINT *INFO);
|
||||
|
||||
|
||||
// a will contain eigenvectors (columns if corder==1), w eigenvalues
|
||||
void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
|
||||
const bool corder, int n, NRMat<double> *b, const int itype)
|
||||
{
|
||||
int m = a.nrows();
|
||||
FINT m = a.nrows();
|
||||
if (m != a.ncols()) laerror("diagonalize() call with non-square matrix");
|
||||
if (a.nrows() != w.size())
|
||||
laerror("inconsistent dimension of eigenvalue vector in diagonalize()");
|
||||
@ -517,22 +521,38 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
|
||||
w.copyonwrite();
|
||||
if(b) b->copyonwrite();
|
||||
|
||||
int r = 0;
|
||||
FINT r = 0;
|
||||
char U ='U';
|
||||
char vectors = 'V';
|
||||
if (!eivec) vectors = 'N';
|
||||
int LWORK = -1;
|
||||
FINT LWORK = -1;
|
||||
double WORKX;
|
||||
int ldb=0; if(b) ldb=b->ncols();
|
||||
FINT ldb=0; if(b) ldb=b->ncols();
|
||||
|
||||
#ifdef FORINT
|
||||
const FINT itypetmp = itype;
|
||||
FINT ntmp = n;
|
||||
// First call is to determine size of workspace
|
||||
if(b) FORNAME(dsygv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
|
||||
else FORNAME(dsyev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, &r );
|
||||
#else
|
||||
// First call is to determine size of workspace
|
||||
if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
|
||||
else FORNAME(dsyev)(&vectors, &U, &n, a, &m, w, &WORKX, &LWORK, &r );
|
||||
LWORK = (int)WORKX;
|
||||
#endif
|
||||
|
||||
LWORK = (FINT)WORKX;
|
||||
double *WORK = new double[LWORK];
|
||||
if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, &r );
|
||||
|
||||
#ifdef FORINT
|
||||
if(b) FORNAME(dsygv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
|
||||
else FORNAME(dsyev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, &r );
|
||||
#else
|
||||
if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, &r );
|
||||
else FORNAME(dsyev)(&vectors, &U, &n, a, &m, w, WORK, &LWORK, &r );
|
||||
delete[] WORK;
|
||||
#endif
|
||||
|
||||
delete[] WORK;
|
||||
if (vectors == 'V' && corder) a.transposeme(n);
|
||||
|
||||
if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()");
|
||||
@ -541,18 +561,18 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
|
||||
|
||||
|
||||
|
||||
extern "C" void FORNAME(zheev)(const char *JOBZ, const char *UPLO, const int *N,
|
||||
complex<double> *A, const int *LDA, double *W, complex<double> *WORK, const int *LWORK, double *RWORK, int *INFO);
|
||||
extern "C" void FORNAME(zheev)(const char *JOBZ, const char *UPLO, const FINT *N,
|
||||
complex<double> *A, const FINT *LDA, double *W, complex<double> *WORK, const FINT *LWORK, double *RWORK, FINT *INFO);
|
||||
|
||||
extern "C" void FORNAME(zhegv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
|
||||
complex<double> *A, const int *LDA, complex<double> *B, const int *LDB, double *W, complex<double> *WORK, const int *LWORK, double *RWORK, int *INFO);
|
||||
extern "C" void FORNAME(zhegv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
|
||||
complex<double> *A, const FINT *LDA, complex<double> *B, const FINT *LDB, double *W, complex<double> *WORK, const FINT *LWORK, double *RWORK, FINT *INFO);
|
||||
|
||||
|
||||
// a will contain eigenvectors (columns if corder==1), w eigenvalues
|
||||
void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
|
||||
const bool corder, int n, NRMat<complex<double> > *b, const int itype)
|
||||
{
|
||||
int m = a.nrows();
|
||||
FINT m = a.nrows();
|
||||
if (m != a.ncols()) laerror("diagonalize() call with non-square matrix");
|
||||
if (a.nrows() != w.size())
|
||||
laerror("inconsistent dimension of eigenvalue vector in diagonalize()");
|
||||
@ -564,23 +584,38 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
|
||||
w.copyonwrite();
|
||||
if(b) b->copyonwrite();
|
||||
|
||||
int r = 0;
|
||||
FINT r = 0;
|
||||
char U ='U';
|
||||
char vectors = 'V';
|
||||
if (!eivec) vectors = 'N';
|
||||
int LWORK = -1;
|
||||
FINT LWORK = -1;
|
||||
complex<double> WORKX;
|
||||
int ldb=0; if(b) ldb=b->ncols();
|
||||
FINT ldb=0; if(b) ldb=b->ncols();
|
||||
|
||||
// First call is to determine size of workspace
|
||||
double *RWORK = new double[3*n+2];
|
||||
if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
|
||||
#ifdef FORINT
|
||||
const FINT itypetmp = itype;
|
||||
FINT ntmp = n;
|
||||
if(b) FORNAME(zhegv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
|
||||
else FORNAME(zheev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, RWORK, &r );
|
||||
#else
|
||||
if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
|
||||
else FORNAME(zheev)(&vectors, &U, &n, a, &m, w, &WORKX, &LWORK, RWORK, &r );
|
||||
LWORK = (int)WORKX.real();
|
||||
#endif
|
||||
|
||||
LWORK = (FINT)WORKX.real();
|
||||
complex<double> *WORK = new complex<double>[LWORK];
|
||||
if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, RWORK, &r );
|
||||
|
||||
#ifdef FORINT
|
||||
if(b) FORNAME(zhegv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
|
||||
else FORNAME(zheev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, RWORK, &r );
|
||||
#else
|
||||
if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, RWORK, &r );
|
||||
else FORNAME(zheev)(&vectors, &U, &n, a, &m, w, WORK, &LWORK, RWORK, &r );
|
||||
delete[] WORK;
|
||||
#endif
|
||||
|
||||
delete[] WORK;
|
||||
delete[] RWORK;
|
||||
if (vectors == 'V' && corder) a.transposeme(n);
|
||||
|
||||
@ -590,11 +625,11 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
|
||||
|
||||
|
||||
|
||||
extern "C" void FORNAME(dspev)(const char *JOBZ, const char *UPLO, const int *N,
|
||||
double *AP, double *W, double *Z, const int *LDZ, double *WORK, int *INFO);
|
||||
extern "C" void FORNAME(dspev)(const char *JOBZ, const char *UPLO, const FINT *N,
|
||||
double *AP, double *W, double *Z, const FINT *LDZ, double *WORK, FINT *INFO);
|
||||
|
||||
extern "C" void FORNAME(dspgv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
|
||||
double *AP, double *BP, double *W, double *Z, const int *LDZ, double *WORK, int *INFO);
|
||||
extern "C" void FORNAME(dspgv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
|
||||
double *AP, double *BP, double *W, double *Z, const FINT *LDZ, double *WORK, FINT *INFO);
|
||||
|
||||
|
||||
// v will contain eigenvectors, w eigenvalues
|
||||
@ -613,14 +648,21 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
|
||||
if(v) v->copyonwrite();
|
||||
if(b) b->copyonwrite();
|
||||
|
||||
int r = 0;
|
||||
FINT r = 0;
|
||||
char U = 'U';
|
||||
char job = v ? 'v' : 'n';
|
||||
|
||||
double *WORK = new double[3*n];
|
||||
int ldv=v?v->ncols():n;
|
||||
if(b) FORNAME(dspgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r );
|
||||
FINT ldv=v?v->ncols():n;
|
||||
#ifdef FORINT
|
||||
const FINT itypetmp = itype;
|
||||
FINT ntmp = n;
|
||||
if(b) FORNAME(dspgv)(&itypetmp,&job, &U, &ntmp, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r );
|
||||
else FORNAME(dspev)(&job, &U, &ntmp, a, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r );
|
||||
#else
|
||||
if(b) FORNAME(dspgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r );
|
||||
else FORNAME(dspev)(&job, &U, &n, a, w, v?(*v)[0]:(double *)0, &ldv, WORK, &r );
|
||||
#endif
|
||||
delete[] WORK;
|
||||
if (v && corder) v->transposeme(n);
|
||||
|
||||
@ -629,11 +671,11 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
|
||||
}
|
||||
|
||||
|
||||
extern "C" void FORNAME(zhpev)(const char *JOBZ, const char *UPLO, const int *N,
|
||||
complex<double> *AP, double *W, complex<double> *Z, const int *LDZ, complex<double> *WORK, double *RWORK, int *INFO);
|
||||
extern "C" void FORNAME(zhpev)(const char *JOBZ, const char *UPLO, const FINT *N,
|
||||
complex<double> *AP, double *W, complex<double> *Z, const FINT *LDZ, complex<double> *WORK, double *RWORK, FINT *INFO);
|
||||
|
||||
extern "C" void FORNAME(zhpgv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
|
||||
complex<double> *AP, complex<double> *BP, double *W, complex<double> *Z, const int *LDZ, complex<double> *WORK, double *RWORK, int *INFO);
|
||||
extern "C" void FORNAME(zhpgv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
|
||||
complex<double> *AP, complex<double> *BP, double *W, complex<double> *Z, const FINT *LDZ, complex<double> *WORK, double *RWORK, FINT *INFO);
|
||||
|
||||
|
||||
// v will contain eigenvectors, w eigenvalues
|
||||
@ -652,15 +694,22 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
|
||||
if(v) v->copyonwrite();
|
||||
if(b) b->copyonwrite();
|
||||
|
||||
int r = 0;
|
||||
FINT r = 0;
|
||||
char U = 'U';
|
||||
char job = v ? 'v' : 'n';
|
||||
|
||||
complex<double> *WORK = new complex<double>[2*n];
|
||||
double *RWORK = new double[3*n];
|
||||
int ldv=v?v->ncols():n;
|
||||
FINT ldv=v?v->ncols():n;
|
||||
#ifdef FORINT
|
||||
const FINT itypetmp = itype;
|
||||
FINT ntmp = n;
|
||||
if(b) FORNAME(zhpgv)(&itypetmp,&job, &U, &ntmp, a, *b, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK, &r );
|
||||
else FORNAME(zhpev)(&job, &U, &ntmp, a, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK, &r );
|
||||
#else
|
||||
if(b) FORNAME(zhpgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK, &r );
|
||||
else FORNAME(zhpev)(&job, &U, &n, a, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK, &r );
|
||||
#endif
|
||||
delete[] WORK;
|
||||
delete[] RWORK;
|
||||
if (v && corder) v->transposeme(n);
|
||||
@ -671,17 +720,17 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
|
||||
|
||||
|
||||
|
||||
extern "C" void FORNAME(dgesvd)(const char *JOBU, const char *JOBVT, const int *M,
|
||||
const int *N, double *A, const int *LDA, double *S, double *U, const int *LDU,
|
||||
double *VT, const int *LDVT, double *WORK, const int *LWORK, int *INFO );
|
||||
extern "C" void FORNAME(dgesvd)(const char *JOBU, const char *JOBVT, const FINT *M,
|
||||
const FINT *N, double *A, const FINT *LDA, double *S, double *U, const FINT *LDU,
|
||||
double *VT, const FINT *LDVT, double *WORK, const FINT *LWORK, FINT *INFO );
|
||||
|
||||
void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s,
|
||||
NRMat<double> *v, const bool corder, int m, int n)
|
||||
{
|
||||
int m0 = a.nrows();
|
||||
int n0 = a.ncols();
|
||||
if(m<=0) m=m0;
|
||||
if(n<=0) n=n0;
|
||||
FINT m0 = a.nrows();
|
||||
FINT n0 = a.ncols();
|
||||
if(m<=0) m=(int)m0;
|
||||
if(n<=0) n=(int)n0;
|
||||
if(n>n0 || m>m0) laerror("bad dimension in singular_decomposition");
|
||||
if (u) if (m > u->nrows() || m> u->ncols())
|
||||
laerror("inconsistent dimension of U Mat in singular_decomposition()");
|
||||
@ -700,14 +749,30 @@ void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s
|
||||
char jobu = u ? 'A' : 'N';
|
||||
char jobv = v ? 'A' : 'N';
|
||||
double work0;
|
||||
int lwork = -1;
|
||||
int r;
|
||||
FINT lwork = -1;
|
||||
FINT r;
|
||||
|
||||
#ifdef FORINT
|
||||
FINT ntmp = n;
|
||||
FINT mtmp = m;
|
||||
FORNAME(dgesvd)(&jobv, &jobu, &ntmp, &mtmp, a, &n0, s, v?(*v)[0]:0, &n0,
|
||||
u?(*u)[0]:0, &m0, &work0, &lwork, &r);
|
||||
#else
|
||||
FORNAME(dgesvd)(&jobv, &jobu, &n, &m, a, &n0, s, v?(*v)[0]:0, &n0,
|
||||
u?(*u)[0]:0, &m0, &work0, &lwork, &r);
|
||||
lwork = (int) work0;
|
||||
#endif
|
||||
|
||||
lwork = (FINT) work0;
|
||||
double *work = new double[lwork];
|
||||
|
||||
#ifdef FORINT
|
||||
FORNAME(dgesvd)(&jobv, &jobu, &ntmp, &mtmp, a, &n0, s, v?(*v)[0]:0, &n0,
|
||||
u?(*u)[0]:0, &m0, work, &lwork, &r);
|
||||
#else
|
||||
FORNAME(dgesvd)(&jobv, &jobu, &n, &m, a, &n0, s, v?(*v)[0]:0, &n0,
|
||||
u?(*u)[0]:0, &m0, work, &lwork, &r);
|
||||
#endif
|
||||
|
||||
delete[] work;
|
||||
if (v && corder) v->transposeme(n);
|
||||
|
||||
@ -719,14 +784,14 @@ void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s
|
||||
//extern "C" void FORNAME(dgeqrf)(const int *M, const int *N, double *A, const int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO);
|
||||
|
||||
|
||||
extern "C" void FORNAME(dgeev)(const char *JOBVL, const char *JOBVR, const int *N,
|
||||
double *A, const int *LDA, double *WR, double *WI, double *VL, const int *LDVL,
|
||||
double *VR, const int *LDVR, double *WORK, const int *LWORK, int *INFO );
|
||||
extern "C" void FORNAME(dgeev)(const char *JOBVL, const char *JOBVR, const FINT *N,
|
||||
double *A, const FINT *LDA, double *WR, double *WI, double *VL, const FINT *LDVL,
|
||||
double *VR, const FINT *LDVR, double *WORK, const FINT *LWORK, FINT *INFO );
|
||||
|
||||
extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const int *N,
|
||||
double *A, const int *LDA, double *B, const int *LDB, double *WR, double *WI, double *WBETA,
|
||||
double *VL, const int *LDVL, double *VR, const int *LDVR,
|
||||
double *WORK, const int *LWORK, int *INFO );
|
||||
extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const FINT *N,
|
||||
double *A, const FINT *LDA, double *B, const FINT *LDB, double *WR, double *WI, double *WBETA,
|
||||
double *VL, const FINT *LDVL, double *VR, const FINT *LDVR,
|
||||
double *WORK, const FINT *LWORK, FINT *INFO );
|
||||
|
||||
|
||||
//statics for sorting
|
||||
@ -802,23 +867,41 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
|
||||
char jobvl = vl ? 'V' : 'N';
|
||||
char jobvr = vr ? 'V' : 'N';
|
||||
double work0;
|
||||
int lwork = -1;
|
||||
int r;
|
||||
int lda=a.ncols();
|
||||
int ldb=0;
|
||||
FINT lwork = -1;
|
||||
FINT r;
|
||||
FINT lda=a.ncols();
|
||||
FINT ldb=0;
|
||||
if(b) ldb=b->ncols();
|
||||
int ldvl= vl?vl->ncols():lda;
|
||||
int ldvr= vr?vr->ncols():lda;
|
||||
if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
|
||||
FINT ldvl= vl?vl->ncols():lda;
|
||||
FINT ldvr= vr?vr->ncols():lda;
|
||||
|
||||
#ifdef FORINT
|
||||
FINT ntmp = n;
|
||||
if(b) FORNAME(dggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
|
||||
&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
|
||||
else FORNAME(dgeev)(&jobvr, &jobvl, &ntmp, a, &lda, wr, wi, vr?vr[0]:(double *)0,
|
||||
&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
|
||||
#else
|
||||
if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
|
||||
&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
|
||||
else FORNAME(dgeev)(&jobvr, &jobvl, &n, a, &lda, wr, wi, vr?vr[0]:(double *)0,
|
||||
&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
|
||||
lwork = (int) work0;
|
||||
#endif
|
||||
|
||||
lwork = (FINT) work0;
|
||||
double *work = new double[lwork];
|
||||
|
||||
#ifdef FORINT
|
||||
if(b) FORNAME(dggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
|
||||
&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
|
||||
else FORNAME(dgeev)(&jobvr, &jobvl, &ntmp, a, &lda, wr, wi, vr?vr[0]:(double *)0,
|
||||
&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
|
||||
#else
|
||||
if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
|
||||
&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
|
||||
else FORNAME(dgeev)(&jobvr, &jobvl, &n, a, &lda, wr, wi, vr?vr[0]:(double *)0,
|
||||
&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
|
||||
#endif
|
||||
delete[] work;
|
||||
|
||||
//std::cout <<"TEST dgeev\n"<<wr<<wi<<*vr<<*vl<<std::endl;
|
||||
@ -1208,16 +1291,16 @@ return r;
|
||||
|
||||
|
||||
//Cholesky interface
|
||||
extern "C" void FORNAME(dpotrf)(const char *UPLO, const int *N, double *A, const int *LDA, int *INFO);
|
||||
extern "C" void FORNAME(zpotrf)(const char *UPLO, const int *N, complex<double> *A, const int *LDA, int *INFO);
|
||||
extern "C" void FORNAME(dpotrf)(const char *UPLO, const FINT *N, double *A, const FINT *LDA, FINT *INFO);
|
||||
extern "C" void FORNAME(zpotrf)(const char *UPLO, const FINT *N, complex<double> *A, const FINT *LDA, FINT *INFO);
|
||||
|
||||
void cholesky(NRMat<double> &a, bool upper)
|
||||
{
|
||||
if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
|
||||
int lda=a.ncols();
|
||||
int n=a.nrows();
|
||||
FINT lda=a.ncols();
|
||||
FINT n=a.nrows();
|
||||
char uplo=upper?'U':'L';
|
||||
int info;
|
||||
FINT info;
|
||||
a.copyonwrite();
|
||||
FORNAME(dpotrf)(&uplo, &n, a, &lda, &info);
|
||||
if(info) {std::cerr << "Lapack error "<<info<<std::endl; laerror("error in Cholesky");}
|
||||
@ -1233,10 +1316,10 @@ else
|
||||
void cholesky(NRMat<complex<double> > &a, bool upper)
|
||||
{
|
||||
if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
|
||||
int lda=a.ncols();
|
||||
int n=a.nrows();
|
||||
FINT lda=a.ncols();
|
||||
FINT n=a.nrows();
|
||||
char uplo=upper?'U':'L';
|
||||
int info;
|
||||
FINT info;
|
||||
a.copyonwrite();
|
||||
a.transposeme();//switch to Fortran order
|
||||
FORNAME(zpotrf)(&uplo, &n, a, &lda, &info);
|
||||
@ -1250,14 +1333,14 @@ else
|
||||
|
||||
|
||||
//various norms
|
||||
extern "C" double FORNAME(zlange)( const char *NORM, const int *M, const int *N, complex<double> *A, const int *LDA, double *WORK);
|
||||
extern "C" double FORNAME(dlange)( const char *NORM, const int *M, const int *N, double *A, const int *LDA, double *WORK);
|
||||
extern "C" double FORNAME(zlange)( const char *NORM, const FINT *M, const FINT *N, complex<double> *A, const FINT *LDA, double *WORK);
|
||||
extern "C" double FORNAME(dlange)( const char *NORM, const FINT *M, const FINT *N, double *A, const FINT *LDA, double *WORK);
|
||||
|
||||
double MatrixNorm(NRMat<complex<double> > &A, const char norm)
|
||||
{
|
||||
const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
|
||||
const int M = A.nrows();
|
||||
const int N = A.ncols();
|
||||
const FINT M = A.nrows();
|
||||
const FINT N = A.ncols();
|
||||
double work[M];
|
||||
const double ret = FORNAME(zlange)(&TypNorm, &M, &N, A[0], &M, &work[0]);
|
||||
return ret;
|
||||
@ -1266,8 +1349,8 @@ double MatrixNorm(NRMat<complex<double> > &A, const char norm)
|
||||
double MatrixNorm(NRMat<double > &A, const char norm)
|
||||
{
|
||||
const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
|
||||
const int M = A.nrows();
|
||||
const int N = A.ncols();
|
||||
const FINT M = A.nrows();
|
||||
const FINT N = A.ncols();
|
||||
double work[M];
|
||||
const double ret = FORNAME(dlange)(&TypNorm, &M, &N, A[0], &M, &work[0]);
|
||||
return ret;
|
||||
@ -1276,15 +1359,15 @@ double MatrixNorm(NRMat<double > &A, const char norm)
|
||||
|
||||
|
||||
//condition number
|
||||
extern "C" void FORNAME(zgecon)( const char *norm, const int *n, complex<double> *A, const int *LDA, const double *anorm, double *rcond, complex<double> *work, double *rwork, int *info);
|
||||
extern "C" void FORNAME(dgecon)( const char *norm, const int *n, double *A, const int *LDA, const double *anorm, double *rcond, double *work, double *rwork, int *info);
|
||||
extern "C" void FORNAME(zgecon)( const char *norm, const FINT *n, complex<double> *A, const FINT *LDA, const double *anorm, double *rcond, complex<double> *work, double *rwork, FINT *info);
|
||||
extern "C" void FORNAME(dgecon)( const char *norm, const FINT *n, double *A, const FINT *LDA, const double *anorm, double *rcond, double *work, double *rwork, FINT *info);
|
||||
|
||||
double CondNumber(NRMat<complex<double> > &A, const char norm)
|
||||
{
|
||||
const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
|
||||
const int N = A.nrows();
|
||||
const FINT N = A.nrows();
|
||||
double Norma(0.0), ret(0.0);
|
||||
int info;
|
||||
FINT info;
|
||||
complex<double> *work;
|
||||
double *rwork;
|
||||
|
||||
@ -1305,9 +1388,9 @@ double CondNumber(NRMat<complex<double> > &A, const char norm)
|
||||
double CondNumber(NRMat<double> &A, const char norm)
|
||||
{
|
||||
const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
|
||||
const int N = A.nrows();
|
||||
const FINT N = A.nrows();
|
||||
double Norma(0.0), ret(0.0);
|
||||
int info;
|
||||
FINT info;
|
||||
double *work;
|
||||
double *rwork;
|
||||
|
||||
|
25
smat.cc
25
smat.cc
@ -44,6 +44,16 @@ namespace LA {
|
||||
template <typename T>
|
||||
void NRSMat<T>::put(int fd, bool dim, bool transp) const
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location!=cpu)
|
||||
{
|
||||
NRSMat<T> tmp= *this;
|
||||
tmp.moveto(cpu);
|
||||
tmp.put(fd,dim,transp);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
errno=0;
|
||||
if(dim)
|
||||
{
|
||||
@ -56,6 +66,18 @@ LA_traits<T>::multiput(NN2,fd,v,dim);
|
||||
template <typename T>
|
||||
void NRSMat<T>::get(int fd, bool dim, bool transp)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location!=cpu)
|
||||
{
|
||||
NRSMat<T> tmp;
|
||||
tmp.moveto(cpu);
|
||||
tmp.get(fd,dim,transp);
|
||||
tmp.moveto(location);
|
||||
*this = tmp;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
int nn0[2]; //align at least 8-byte
|
||||
errno=0;
|
||||
if(dim)
|
||||
@ -402,6 +424,9 @@ cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2);
|
||||
}
|
||||
|
||||
|
||||
//some template specializations leading to BLAS/CUBLAS calls
|
||||
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
263
smat.h
263
smat.h
@ -29,12 +29,20 @@ protected:
|
||||
int nn;
|
||||
T *v;
|
||||
int *count;
|
||||
#ifdef CUDALA
|
||||
GPUID location;
|
||||
#endif
|
||||
public:
|
||||
friend class NRVec<T>;
|
||||
friend class NRMat<T>;
|
||||
|
||||
inline NRSMat() : nn(0),v(0),count(0) {};
|
||||
inline explicit NRSMat(const int n); // Zero-based array
|
||||
inline NRSMat() : nn(0),v(0),count(0)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location = DEFAULT_LOC;
|
||||
#endif
|
||||
};
|
||||
inline explicit NRSMat(const int n, const GPUID loc= undefined);// Zero-based array
|
||||
inline NRSMat(const T &a, const int n); //Initialize to constant
|
||||
inline NRSMat(const T *a, const int n); // Initialize to array
|
||||
inline NRSMat(const NRSMat &rhs); // Copy constructor
|
||||
@ -45,6 +53,13 @@ public:
|
||||
NRSMat & operator=(const NRSMat &rhs); //assignment
|
||||
void randomize(const typename LA_traits<T>::normtype &x);
|
||||
NRSMat & operator=(const T &a); //assign a to diagonal
|
||||
#ifdef CUDALA
|
||||
inline GPUID getlocation() const {return location;}
|
||||
void moveto(const GPUID dest);
|
||||
#else
|
||||
inline GPUID getlocation() const {return cpu;}
|
||||
void moveto(const GPUID dest) {};
|
||||
#endif
|
||||
const bool operator!=(const NRSMat &rhs) const {if(nn!=rhs.nn) return 1; return LA_traits<T>::gencmp(v,rhs.v,NN2);} //memcmp for scalars else elementwise
|
||||
const bool operator==(const NRSMat &rhs) const {return !(*this != rhs);};
|
||||
inline NRSMat & operator*=(const T &a);
|
||||
@ -65,8 +80,8 @@ public:
|
||||
const NRMat<T> operator*(const NRMat<T> &rhs) const; // SMat*Mat
|
||||
const T dot(const NRSMat &rhs) const; // Smat.Smat//@@@for complex do conjugate
|
||||
const T dot(const NRVec<T> &rhs) const; //Smat(as vec).vec //@@@for complex do conjugate
|
||||
const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
|
||||
const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
|
||||
const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
|
||||
const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
|
||||
const T* diagonalof(NRVec<T> &, const bool divide=0, bool cache=false) const; //get diagonal
|
||||
void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const {r.gemv(beta,*this,trans,alpha,x);};
|
||||
void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);};
|
||||
@ -108,29 +123,63 @@ namespace LA {
|
||||
|
||||
// ctors
|
||||
template <typename T>
|
||||
inline NRSMat<T>::NRSMat(const int n) : nn(n), v(new T[NN2]),
|
||||
count(new int) {*count = 1;}
|
||||
|
||||
template <typename T>
|
||||
inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n),
|
||||
v(new T[NN2]), count(new int)
|
||||
inline NRSMat<T>::NRSMat(const int n, const GPUID loc) : nn(n), count(new int(1))
|
||||
{
|
||||
*count =1;
|
||||
if(a != (T)0) for(int i=0; i<NN2; i++) v[i] = a;
|
||||
else memset(v, 0, NN2*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
location= (loc==undefined?DEFAULT_LOC:loc);
|
||||
if(location==cpu)
|
||||
#endif
|
||||
v=new T[NN2];
|
||||
#ifdef CUDALA
|
||||
else v= (T*) gpualloc(NN2*sizeof(T));
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline NRSMat<T>::NRSMat(const T *a, const int n) : nn(n),
|
||||
v(new T[NN2]), count(new int)
|
||||
inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n), count(new int(1))
|
||||
{
|
||||
*count = 1;
|
||||
memcpy(v, a, NN2*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
location=DEFAULT_LOC;
|
||||
if(location==cpu)
|
||||
#endif
|
||||
{
|
||||
v=new T[NN2];
|
||||
if(a != (T)0) for(int i=0; i<NN2; i++) v[i] = a;
|
||||
else memset(v, 0, NN2*sizeof(T));
|
||||
}
|
||||
#ifdef CUDALA
|
||||
else
|
||||
{
|
||||
v= (T*) gpualloc(NN2*sizeof(T));
|
||||
cublasSetVector(NN2,sizeof(T),&a,0,v,1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline NRSMat<T>::NRSMat(const T *a, const int n) : nn(n), count(new int(1))
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location=DEFAULT_LOC;
|
||||
if(location==cpu)
|
||||
#endif
|
||||
memcpy(v, a, NN2*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
else
|
||||
{
|
||||
v= (T*) gpualloc(NN2*sizeof(T));
|
||||
cublasSetVector(NN2,sizeof(T),a,1,v,1);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
inline NRSMat<T>::NRSMat(const NRSMat<T> &rhs) //copy constructor
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
#endif
|
||||
v = rhs.v;
|
||||
nn = rhs.nn;
|
||||
count = rhs.count;
|
||||
@ -140,6 +189,9 @@ inline NRSMat<T>::NRSMat(const NRSMat<T> &rhs) //copy constructor
|
||||
template <typename T>
|
||||
NRSMat<T>::NRSMat(const NRVec<T> &rhs, const int n) // type conversion
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
#endif
|
||||
nn = n;
|
||||
#ifdef DEBUG
|
||||
if (NN2 != rhs.size())
|
||||
@ -150,6 +202,7 @@ NRSMat<T>::NRSMat(const NRVec<T> &rhs, const int n) // type conversion
|
||||
(*count)++;
|
||||
}
|
||||
|
||||
|
||||
// S *= a
|
||||
template<>
|
||||
inline NRSMat<double> & NRSMat<double>::operator*=(const double & a)
|
||||
@ -437,33 +490,31 @@ NRSMat<T>::~NRSMat()
|
||||
{
|
||||
if (!count) return;
|
||||
if (--(*count) <= 0) {
|
||||
if (v) delete[] (v);
|
||||
if (v)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
delete[] v;
|
||||
#ifdef CUDALA
|
||||
else gpufree(v);
|
||||
#endif
|
||||
}
|
||||
delete count;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// assignment with a physical copy
|
||||
template <typename T>
|
||||
NRSMat<T> & NRSMat<T>::operator|=(const NRSMat<T> &rhs)
|
||||
{
|
||||
if (this != &rhs) {
|
||||
if(!rhs.v) laerror("unallocated rhs in NRSMat operator |=");
|
||||
if(count)
|
||||
if(*count > 1) { // detach from the other
|
||||
--(*count);
|
||||
nn = 0;
|
||||
count = 0;
|
||||
v = 0;
|
||||
}
|
||||
if (nn != rhs.nn) {
|
||||
if(v) delete [] (v);
|
||||
nn = rhs.nn;
|
||||
}
|
||||
if (!v) v = new T[NN2];
|
||||
if (!count) count = new int;
|
||||
*count = 1;
|
||||
memcpy(v, rhs.v, NN2*sizeof(T));
|
||||
}
|
||||
#ifdef DEBUG
|
||||
if (!rhs.v) laerror("unallocated rhs in NRSMat operator |=");
|
||||
#endif
|
||||
if (this == &rhs) return *this;
|
||||
*this = rhs;
|
||||
this->copyonwrite();
|
||||
return *this;
|
||||
}
|
||||
|
||||
@ -474,13 +525,24 @@ NRSMat<T> & NRSMat<T>::operator=(const NRSMat<T> & rhs)
|
||||
{
|
||||
if (this == & rhs) return *this;
|
||||
if (count)
|
||||
if(--(*count) == 0) {
|
||||
delete [] v;
|
||||
if(--(*count) == 0)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
delete [] v;
|
||||
#ifdef CUDALA
|
||||
else
|
||||
gpufree(v);
|
||||
#endif
|
||||
delete count;
|
||||
}
|
||||
}
|
||||
v = rhs.v;
|
||||
nn = rhs.nn;
|
||||
count = rhs.count;
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
#endif
|
||||
if (count) (*count)++;
|
||||
return *this;
|
||||
}
|
||||
@ -495,9 +557,24 @@ void NRSMat<T>::copyonwrite()
|
||||
(*count)--;
|
||||
count = new int;
|
||||
*count = 1;
|
||||
T *newv = new T[NN2];
|
||||
memcpy(newv, v, NN2*sizeof(T));
|
||||
v = newv;
|
||||
T *newv;
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
newv = new T[NN2];
|
||||
memcpy(newv, v, NN2*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
newv = (T *) gpualloc(NN2*sizeof(T));
|
||||
if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
|
||||
cublasScopy(NN2*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
|
||||
}
|
||||
#endif
|
||||
|
||||
v = newv;
|
||||
}
|
||||
}
|
||||
|
||||
@ -514,7 +591,16 @@ void NRSMat<T>::resize(const int n)
|
||||
if(n==0)
|
||||
{
|
||||
if(--(*count) <= 0) {
|
||||
if(v) delete[] (v);
|
||||
if(v) {
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
delete[] (v);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
gpufree(v);
|
||||
#endif
|
||||
}
|
||||
delete count;
|
||||
}
|
||||
count=0;
|
||||
@ -534,16 +620,71 @@ void NRSMat<T>::resize(const int n)
|
||||
count = new int;
|
||||
*count = 1;
|
||||
nn = n;
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
v = new T[NN2];
|
||||
#ifdef CUDALA
|
||||
else
|
||||
v = (T*) gpualloc(NN2*sizeof(T));
|
||||
#endif
|
||||
|
||||
return;
|
||||
}
|
||||
if (n != nn) {
|
||||
nn = n;
|
||||
delete[] v;
|
||||
v = new T[NN2];
|
||||
nn = n;
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
{
|
||||
delete[] v;
|
||||
v = new T[NN2];
|
||||
}
|
||||
#ifdef CUDALA
|
||||
else
|
||||
{
|
||||
gpufree(v);
|
||||
v = (T*) gpualloc(NN2*sizeof(T));
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CUDALA
|
||||
template<typename T>
|
||||
void NRSMat<T>::moveto(const GPUID dest)
|
||||
{
|
||||
if(location==dest) return;
|
||||
location=dest;
|
||||
|
||||
if(v && !count) laerror("internal inconsistency of reference counting 1");
|
||||
if (!count) return;
|
||||
|
||||
if(v && *count==0) laerror("internal inconsistency of reference counting 2");
|
||||
if(!v) return;
|
||||
|
||||
T *vold = v;
|
||||
|
||||
if(dest == cpu) //moving from GPU to CPU
|
||||
{
|
||||
v = new T[NN2];
|
||||
gpuget(NN2,sizeof(T),vold,v);
|
||||
if(*count == 1) gpufree(vold);
|
||||
else {--(*count); count = new int(1);}
|
||||
}
|
||||
else //moving from CPU to GPU
|
||||
{
|
||||
v=(T *) gpualloc(NN2*sizeof(T));
|
||||
gpuput(NN2,sizeof(T),vold,v);
|
||||
if(*count == 1) delete[] vold;
|
||||
else {--(*count); count = new int(1);}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
|
||||
template<typename T>
|
||||
NRSMat<complex<T> > complexify(const NRSMat<T> &rhs)
|
||||
@ -554,10 +695,15 @@ for(int i=0; i<rhs.nrows(); ++i)
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
// I/O
|
||||
template <typename T>
|
||||
std::ostream& operator<<(std::ostream &s, const NRSMat<T> &x)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(x.getlocation()==cpu)
|
||||
{
|
||||
#endif
|
||||
int i,j,n;
|
||||
n=x.nrows();
|
||||
s << n << ' ' << n << '\n';
|
||||
@ -566,12 +712,25 @@ std::ostream& operator<<(std::ostream &s, const NRSMat<T> &x)
|
||||
for(j=0; j<n;j++) s << (typename LA_traits_io<T>::IOtype)x(i,j) << (j==n-1 ? '\n' : ' ');
|
||||
}
|
||||
return s;
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
NRSMat<T> tmp=x;
|
||||
tmp.moveto(cpu);
|
||||
return s<<tmp;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
std::istream& operator>>(std::istream &s, NRSMat<T> &x)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(x.getlocation()==cpu)
|
||||
{
|
||||
#endif
|
||||
int i,j,n,m;
|
||||
s >> n >> m;
|
||||
if(n!=m) laerror("input symmetric matrix not square");
|
||||
@ -579,6 +738,18 @@ std::istream& operator>>(std::istream &s, NRSMat<T> &x)
|
||||
typename LA_traits_io<T>::IOtype tmp;
|
||||
for(i=0;i<n;i++) for(j=0; j<m;j++) {s>>tmp; x(i,j)=tmp;}
|
||||
return s;
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
NRSMat<T> tmp;
|
||||
tmp.moveto(cpu);
|
||||
s >> tmp;
|
||||
tmp.moveto(x.getlocation());
|
||||
x=tmp;
|
||||
return s;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
57
t.cc
57
t.cc
@ -24,6 +24,7 @@
|
||||
using namespace std;
|
||||
using namespace LA;
|
||||
|
||||
|
||||
extern void test(const NRVec<double> &x);
|
||||
|
||||
|
||||
@ -1551,7 +1552,7 @@ cout <<"Error = "<<(cx.transpose(true)*cx -bb).norm()<<endl;
|
||||
}
|
||||
|
||||
|
||||
if(1)
|
||||
if(0)
|
||||
{
|
||||
int n;
|
||||
cin >>n;
|
||||
@ -1574,5 +1575,59 @@ cout <<"Cholesky factor filling = "<<b.simplify()<<endl;
|
||||
}
|
||||
|
||||
|
||||
if(0)
|
||||
{
|
||||
int n;
|
||||
cin >>n;
|
||||
NRMat<double> a(n,n),b(n,n);
|
||||
a.randomize(1.);
|
||||
b.randomize(1.);
|
||||
NRMat<double>c;
|
||||
double t=clock()/((double) (CLOCKS_PER_SEC));
|
||||
int rep=1+10000000000LL/n/n/n;
|
||||
cout <<"Repetitions " <<rep<<endl;
|
||||
for(int i=0; i<rep; ++i) c=a*b;
|
||||
cout <<"CPU time (ms) "<<(clock()/((double) (CLOCKS_PER_SEC))-t)*1000./rep <<"\n";
|
||||
NRMat<double>cgpu;
|
||||
a.moveto(gpu1);
|
||||
b.moveto(gpu1);
|
||||
t=clock()/((double) (CLOCKS_PER_SEC));
|
||||
for(int i=0; i<rep; ++i) cgpu=a*b;
|
||||
cout <<"GPU time (ms) "<<(clock()/((double) (CLOCKS_PER_SEC))-t)*1000./rep <<"\n";
|
||||
cgpu.moveto(cpu);
|
||||
cout << "Error = "<<(c-cgpu).norm()<<endl;
|
||||
}
|
||||
|
||||
if(1)
|
||||
{
|
||||
int n;
|
||||
cin >>n;
|
||||
NRMat<double> a(n,n);
|
||||
a.randomize(1.);
|
||||
NRVec<double> v(n);
|
||||
v.randomize(1.);
|
||||
NRSMat<double> as(n);
|
||||
as.randomize(1.);
|
||||
NRVec<double>w = a*v;
|
||||
NRVec<double>ws = as*v;
|
||||
NRMat<double>c(n,n);
|
||||
c=exp(a);
|
||||
a.moveto(gpu1);
|
||||
v.moveto(gpu1);
|
||||
as.moveto(gpu1);
|
||||
NRVec<double>wgpu = a*v;
|
||||
NRVec<double>wsgpu = as*v;
|
||||
w.moveto(gpu1);
|
||||
ws.moveto(gpu1);
|
||||
cout << "Error gemv = "<<(w-wgpu).norm()<<endl;
|
||||
cout << "Error symv = "<<(ws-wsgpu).norm()<<endl;
|
||||
NRMat<double>cgpu;
|
||||
cgpu=exp(a);
|
||||
c.moveto(gpu1);
|
||||
cout << "Error = "<<(c-cgpu).norm()<<endl;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
46
vec.cc
46
vec.cc
@ -84,6 +84,16 @@ NRVec<T>::NRVec(const NRMat<T> &rhs)
|
||||
template <typename T>
|
||||
void NRVec<T>::put(int fd, bool dim, bool transp) const
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location!=cpu)
|
||||
{
|
||||
NRVec<T> tmp= *this;
|
||||
tmp.moveto(cpu);
|
||||
tmp.put(fd,dim,transp);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
errno=0;
|
||||
int pad=1; //align at least 8-byte
|
||||
if(dim)
|
||||
@ -94,9 +104,21 @@ if(sizeof(int) != write(fd,&pad,sizeof(int))) laerror("cannot write");
|
||||
LA_traits<T>::multiput(nn,fd,v,dim);
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
void NRVec<T>::get(int fd, bool dim, bool transp)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location!=cpu)
|
||||
{
|
||||
NRVec<T> tmp;
|
||||
tmp.moveto(cpu);
|
||||
tmp.get(fd,dim,transp);
|
||||
tmp.moveto(location);
|
||||
*this = tmp;
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
int nn0[2]; //align at least 8-byte
|
||||
errno=0;
|
||||
if(dim)
|
||||
@ -319,9 +341,16 @@ void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
|
||||
if ((trans == 'n'?A.ncols():A.nrows()) != x.size())
|
||||
laerror("incompatible sizes in gemv A*x");
|
||||
#endif
|
||||
SAME_LOC3(*this,x,A);
|
||||
copyonwrite();
|
||||
cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans),
|
||||
A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
cublasDgemv((trans=='n' ?'T':'N'),A.ncols(), A.nrows(),alpha, A, A.ncols(), x.v, 1, beta, v, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -362,8 +391,16 @@ void NRVec<double>::gemv(const double beta, const NRSMat<double> &A,
|
||||
#ifdef DEBUG
|
||||
if (A.ncols()!=x.size()) laerror("incompatible dimension in gemv A*x");
|
||||
#endif
|
||||
SAME_LOC3(*this,A,x);
|
||||
copyonwrite();
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, x.v, 1, beta, v, 1);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
cublasDspmv('U',A.ncols(), alpha, A, x.v, 1, beta, v, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
template<>
|
||||
@ -420,6 +457,7 @@ NRVec<complex<double> >::otimes(const NRVec< complex<double> > &b, const bool co
|
||||
template<typename T>
|
||||
int NRVec<T>::sort(int direction, int from, int to, int *perm)
|
||||
{
|
||||
NOT_GPU(*this);
|
||||
copyonwrite();
|
||||
if(to == -1) to=nn-1;
|
||||
if(direction) return memqsort<1,NRVec<T>,int,int>(*this,perm,from,to);
|
||||
@ -427,6 +465,10 @@ else return memqsort<0,NRVec<T>,int,int>(*this,perm,from,to);
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//// forced instantization in the corespoding object file
|
||||
|
||||
|
714
vec.h
714
vec.h
@ -30,6 +30,9 @@ template <typename T> void lawritemat(FILE *file,const T *a,int r,int c,
|
||||
|
||||
// Memory allocated constants for cblas routines
|
||||
const static complex<double> CONE = 1.0, CMONE = -1.0, CZERO = 0.0;
|
||||
#ifdef CUDALA
|
||||
const static cuDoubleComplex CUONE = {1.,0.}, CUMONE = {-1.,0.}, CUZERO = {0.,0.};
|
||||
#endif
|
||||
|
||||
// Macros to construct binary operators +,-,*, from +=, -=, *=
|
||||
// for 3 cases: X + a, a + X, X + Y
|
||||
@ -44,7 +47,7 @@ template<class T> \
|
||||
|
||||
#define NRVECMAT_OPER2(E,X) \
|
||||
template<class T> \
|
||||
inline const NR##E<T> NR##E<T>::operator X(const NR##E<T> &a) const \
|
||||
inline const NR##E<T> NR##E<T>::operator X(const NR##E<T> &a) const \
|
||||
{ return NR##E(*this) X##= a; }
|
||||
|
||||
|
||||
@ -55,12 +58,32 @@ protected:
|
||||
int nn;
|
||||
T *v;
|
||||
int *count;
|
||||
#ifdef CUDALA
|
||||
GPUID location;
|
||||
#endif
|
||||
public:
|
||||
friend class NRSMat<T>;
|
||||
friend class NRMat<T>;
|
||||
|
||||
inline NRVec(): nn(0),v(0),count(0){};
|
||||
explicit inline NRVec(const int n) : nn(n), v(new T[n]), count(new int(1)) {};
|
||||
inline NRVec(): nn(0),v(0),count(0)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location = DEFAULT_LOC;
|
||||
#endif
|
||||
};
|
||||
explicit inline NRVec(const int n, const GPUID loc= undefined) : nn(n), count(new int(1))
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(loc==undefined) location = DEFAULT_LOC; else location = loc;
|
||||
if(location==cpu)
|
||||
#endif
|
||||
v= new T[n];
|
||||
#ifdef CUDALA
|
||||
else
|
||||
v= (T*) gpualloc(n*sizeof(T));
|
||||
#endif
|
||||
};
|
||||
|
||||
inline NRVec(const T &a, const int n);
|
||||
inline NRVec(const T *a, const int n);
|
||||
inline NRVec(T *a, const int n, bool skeleton);
|
||||
@ -71,6 +94,13 @@ public:
|
||||
explicit NRVec(const NRMat<T> &rhs) : NRVec(&rhs[0][0],rhs.nrows()*rhs.ncols()) {};
|
||||
#else
|
||||
explicit NRVec(const NRMat<T> &rhs);
|
||||
#endif
|
||||
#ifdef CUDALA
|
||||
inline GPUID getlocation() const {return location;}
|
||||
void moveto(const GPUID dest);
|
||||
#else
|
||||
inline GPUID getlocation() const {return cpu;}
|
||||
void moveto(const GPUID dest) {};
|
||||
#endif
|
||||
NRVec & operator=(const NRVec &rhs);
|
||||
NRVec & operator=(const T &a); //assign a to every element
|
||||
@ -103,8 +133,8 @@ public:
|
||||
void gemv(const T beta, const SparseMat<T> &a, const char trans, const T alpha, const NRVec &x,const bool treat_as_symmetric=false);
|
||||
void gemv(const typename LA_traits_complex<T>::Component_type beta, const typename LA_traits_complex<T>::NRMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex<T>::Component_type alpha, const NRVec &x);
|
||||
void gemv(const typename LA_traits_complex<T>::Component_type beta, const typename LA_traits_complex<T>::NRSMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex<T>::Component_type alpha, const NRVec &x);
|
||||
const NRVec operator*(const NRMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
|
||||
const NRVec operator*(const NRSMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
|
||||
const NRVec operator*(const NRMat<T> &mat) const {NRVec<T> result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
|
||||
const NRVec operator*(const NRSMat<T> &mat) const {NRVec<T> result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
|
||||
const NRVec operator*(const SparseMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
|
||||
const NRMat<T> otimes(const NRVec<T> &rhs, const bool conjugate=false, const T &scale=1) const; //outer product
|
||||
inline const NRMat<T> operator|(const NRVec<T> &rhs) const {return otimes(rhs,true);};
|
||||
@ -150,29 +180,58 @@ public:
|
||||
#include "sparsemat.h"
|
||||
#include "sparsesmat.h"
|
||||
|
||||
|
||||
|
||||
namespace LA {
|
||||
// formatted I/O
|
||||
template <typename T>
|
||||
std::ostream & operator<<(std::ostream &s, const NRVec<T> &x)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(x.getlocation()==cpu)
|
||||
{
|
||||
#endif
|
||||
int i, n;
|
||||
|
||||
n = x.size();
|
||||
s << n << std::endl;
|
||||
for(i=0; i<n; i++) s << (typename LA_traits_io<T>::IOtype)x[i] << (i == n-1 ? '\n' : ' ');
|
||||
return s;
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
NRVec<T> tmp=x;
|
||||
tmp.moveto(cpu);
|
||||
return s<<tmp;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::istream & operator>>(std::istream &s, NRVec<T> &x)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(x.getlocation()==cpu)
|
||||
{
|
||||
#endif
|
||||
int i,n;
|
||||
|
||||
s >> n;
|
||||
x.resize(n);
|
||||
typename LA_traits_io<T>::IOtype tmp;
|
||||
for(i=0; i<n; i++) {s >> tmp; x[i]=tmp;}
|
||||
return s;
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
NRVec<T> tmp;
|
||||
tmp.moveto(cpu);
|
||||
s >> tmp;
|
||||
tmp.moveto(x.getlocation());
|
||||
x=tmp;
|
||||
return s;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -180,22 +239,51 @@ std::istream & operator>>(std::istream &s, NRVec<T> &x)
|
||||
|
||||
// ctors
|
||||
template <typename T>
|
||||
inline NRVec<T>::NRVec(const T& a, const int n) : nn(n), v(new T[n]), count(new int)
|
||||
inline NRVec<T>::NRVec(const T& a, const int n) : nn(n), count(new int)
|
||||
{
|
||||
*count = 1;
|
||||
#ifdef CUDALA
|
||||
location=DEFAULT_LOC;
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
v = new T[n];
|
||||
if(a != (T)0)
|
||||
for(int i=0; i<n; i++)
|
||||
v[i] = a;
|
||||
else
|
||||
memset(v, 0, nn*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
v= (T*) gpualloc(n*sizeof(T));
|
||||
cublasSetVector(n,sizeof(T),&a,0,v,1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
template <typename T>
|
||||
inline NRVec<T>::NRVec(const T *a, const int n) : nn(n), count(new int)
|
||||
{
|
||||
v=new T[n];
|
||||
*count = 1;
|
||||
memcpy(v, a, n*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
location=DEFAULT_LOC;
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
v=new T[n];
|
||||
*count = 1;
|
||||
memcpy(v, a, n*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
v= (T*) gpualloc(n*sizeof(T));
|
||||
cublasSetVector(n,sizeof(T),a,1,v,1);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
@ -203,12 +291,28 @@ inline NRVec<T>::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int)
|
||||
{
|
||||
if(!skeleton)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location=DEFAULT_LOC;
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
v=new T[n];
|
||||
*count = 1;
|
||||
memcpy(v, a, n*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
v= (T*) gpualloc(n*sizeof(T));
|
||||
cublasSetVector(n,sizeof(T),a,1,v,1);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location!=cpu) laerror("NRVec() with skeleton option cannot be on GPU");
|
||||
#endif
|
||||
*count = 2;
|
||||
v=a;
|
||||
}
|
||||
@ -217,6 +321,9 @@ inline NRVec<T>::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int)
|
||||
template <typename T>
|
||||
inline NRVec<T>::NRVec(const NRVec<T> &rhs)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
#endif
|
||||
v = rhs.v;
|
||||
nn = rhs.nn;
|
||||
count = rhs.count;
|
||||
@ -226,6 +333,9 @@ inline NRVec<T>::NRVec(const NRVec<T> &rhs)
|
||||
template <typename T>
|
||||
inline NRVec<T>::NRVec(const NRSMat<T> &rhs)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
#endif
|
||||
nn = rhs.nn;
|
||||
nn = NN2;
|
||||
v = rhs.v;
|
||||
@ -233,28 +343,11 @@ inline NRVec<T>::NRVec(const NRSMat<T> &rhs)
|
||||
(*count)++;
|
||||
}
|
||||
|
||||
// x += a
|
||||
template<>
|
||||
inline NRVec<double> & NRVec<double>::operator+=(const double &a)
|
||||
{
|
||||
copyonwrite();
|
||||
cblas_daxpy(nn, 1.0, &a, 0, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline NRVec< complex<double> > &
|
||||
NRVec< complex<double> >::operator+=(const complex<double> &a)
|
||||
{
|
||||
copyonwrite();
|
||||
cblas_zaxpy(nn, &CONE, &a, 0, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
//and for general type
|
||||
// x +/-= a
|
||||
template <typename T>
|
||||
inline NRVec<T> & NRVec<T>::operator+=(const T &a)
|
||||
{
|
||||
NOT_GPU(*this);
|
||||
copyonwrite();
|
||||
int i;
|
||||
for(i=0; i<nn; ++i) v[i]+=a;
|
||||
@ -262,65 +355,26 @@ inline NRVec<T> & NRVec<T>::operator+=(const T &a)
|
||||
}
|
||||
|
||||
|
||||
// x -= a
|
||||
template<>
|
||||
inline NRVec<double> & NRVec<double>::operator-=(const double &a)
|
||||
{
|
||||
copyonwrite();
|
||||
cblas_daxpy(nn, -1.0, &a, 0, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline NRVec< complex<double> > &
|
||||
NRVec< complex<double> >::operator-=(const complex<double> &a)
|
||||
{
|
||||
copyonwrite();
|
||||
cblas_zaxpy(nn, &CMONE, &a, 0, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
//and for general type
|
||||
template <typename T>
|
||||
inline NRVec<T> & NRVec<T>::operator-=(const T &a)
|
||||
{
|
||||
NOT_GPU(*this);
|
||||
copyonwrite();
|
||||
int i;
|
||||
for(i=0; i<nn; ++i) v[i]-=a;
|
||||
int i;
|
||||
for(i=0; i<nn; ++i) v[i]-=a;
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// x += x
|
||||
template<>
|
||||
inline NRVec<double> & NRVec<double>::operator+=(const NRVec<double> &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
|
||||
#endif
|
||||
copyonwrite();
|
||||
cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline NRVec< complex<double> > &
|
||||
NRVec< complex<double> >::operator+=(const NRVec< complex<double> > &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
|
||||
#endif
|
||||
copyonwrite();
|
||||
cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
//and for general type
|
||||
template <typename T>
|
||||
inline NRVec<T> & NRVec<T>::operator+=(const NRVec<T> &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
|
||||
NOT_GPU(*this);
|
||||
NOT_GPU(rhs);
|
||||
#endif
|
||||
copyonwrite();
|
||||
int i;
|
||||
@ -346,6 +400,8 @@ inline NRVec<T> & NRVec<T>::operator/=(const NRVec<T> &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("/= of incompatible vectors");
|
||||
NOT_GPU(*this);
|
||||
NOT_GPU(rhs);
|
||||
#endif
|
||||
copyonwrite();
|
||||
int i;
|
||||
@ -356,35 +412,13 @@ inline NRVec<T> & NRVec<T>::operator/=(const NRVec<T> &rhs)
|
||||
|
||||
|
||||
// x -= x
|
||||
template<>
|
||||
inline NRVec<double> & NRVec<double>::operator-=(const NRVec<double> &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
|
||||
#endif
|
||||
copyonwrite();
|
||||
cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline NRVec< complex<double> > &
|
||||
NRVec< complex<double> >::operator-=(const NRVec< complex<double> > &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
|
||||
#endif
|
||||
copyonwrite();
|
||||
cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
//and for general type
|
||||
template <typename T>
|
||||
inline NRVec<T> & NRVec<T>::operator-=(const NRVec<T> &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
|
||||
NOT_GPU(*this);
|
||||
NOT_GPU(rhs);
|
||||
#endif
|
||||
copyonwrite();
|
||||
int i;
|
||||
@ -394,27 +428,10 @@ inline NRVec<T> & NRVec<T>::operator-=(const NRVec<T> &rhs)
|
||||
|
||||
|
||||
// x *= a
|
||||
template<>
|
||||
inline NRVec<double> & NRVec<double>::operator*=(const double &a)
|
||||
{
|
||||
copyonwrite();
|
||||
cblas_dscal(nn, a, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline NRVec< complex<double> > &
|
||||
NRVec< complex<double> >::operator*=(const complex<double> &a)
|
||||
{
|
||||
copyonwrite();
|
||||
cblas_zscal(nn, &a, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
//and for general type
|
||||
template <typename T>
|
||||
inline NRVec<T> & NRVec<T>::operator*=(const T &a)
|
||||
{
|
||||
NOT_GPU(*this);
|
||||
copyonwrite();
|
||||
int i;
|
||||
for(i=0; i<nn; ++i) v[i]*=a;
|
||||
@ -423,33 +440,13 @@ inline NRVec<T> & NRVec<T>::operator*=(const T &a)
|
||||
|
||||
|
||||
// scalar product x.y
|
||||
template<>
|
||||
inline const double NRVec<double>::operator*(const NRVec<double> &rhs) const
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("dot of incompatible vectors");
|
||||
#endif
|
||||
return cblas_ddot(nn, v, 1, rhs.v, 1);
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
inline const complex<double>
|
||||
NRVec< complex<double> >::operator*(const NRVec< complex<double> > &rhs) const
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("dot of incompatible vectors");
|
||||
#endif
|
||||
complex<double> dot;
|
||||
cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot);
|
||||
return dot;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
inline const T NRVec<T>::operator*(const NRVec<T> &rhs) const
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("dot of incompatible vectors");
|
||||
NOT_GPU(*this);
|
||||
NOT_GPU(rhs);
|
||||
#endif
|
||||
T dot = 0;
|
||||
for(int i=0; i<nn; ++i) dot+= v[i]*rhs.v[i];
|
||||
@ -458,28 +455,6 @@ inline const T NRVec<T>::operator*(const NRVec<T> &rhs) const
|
||||
|
||||
|
||||
|
||||
// Sum of elements
|
||||
template<>
|
||||
inline const double NRVec<double>::asum() const
|
||||
{
|
||||
return cblas_dasum(nn, v, 1);
|
||||
}
|
||||
|
||||
|
||||
// Dot product: x * y
|
||||
template<>
|
||||
inline const double NRVec<double>::dot(const double *y, const int stride) const
|
||||
{
|
||||
return cblas_ddot(nn, y, stride, v, 1);
|
||||
}
|
||||
template<>
|
||||
inline const complex<double>
|
||||
NRVec< complex<double> >::dot(const complex<double> *y, const int stride) const
|
||||
{
|
||||
complex<double> dot;
|
||||
cblas_zdotc_sub(nn, y, stride, v, 1, &dot);
|
||||
return dot;
|
||||
}
|
||||
|
||||
// x[i] returns i-th element
|
||||
template <typename T>
|
||||
@ -489,6 +464,7 @@ inline T & NRVec<T>::operator[](const int i)
|
||||
if(_LA_count_check && *count != 1) laerror("possible lval [] with count > 1");
|
||||
if(i < 0 || i >= nn) laerror("NRVec out of range");
|
||||
if(!v) laerror("[] on unallocated NRVec");
|
||||
NOT_GPU(*this);
|
||||
#endif
|
||||
return v[i];
|
||||
}
|
||||
@ -498,6 +474,7 @@ inline const T & NRVec<T>::operator[](const int i) const
|
||||
#ifdef DEBUG
|
||||
if(i < 0 || i >= nn) laerror("NRVec out of range");
|
||||
if(!v) laerror("[] on unallocated NRVec");
|
||||
NOT_GPU(*this);
|
||||
#endif
|
||||
return v[i];
|
||||
}
|
||||
@ -527,29 +504,6 @@ inline NRVec<T>::operator const T*() const
|
||||
return v;
|
||||
}
|
||||
|
||||
// return norm of the Vec
|
||||
template<>
|
||||
inline const double NRVec<double>::norm() const
|
||||
{
|
||||
return cblas_dnrm2(nn, v, 1);
|
||||
}
|
||||
template<>
|
||||
inline const double NRVec< complex<double> >::norm() const
|
||||
{
|
||||
return cblas_dznrm2(nn, v, 1);
|
||||
}
|
||||
|
||||
// Max element of the array
|
||||
template<>
|
||||
inline const double NRVec<double>::amax() const
|
||||
{
|
||||
return v[cblas_idamax(nn, v, 1)];
|
||||
}
|
||||
template<>
|
||||
inline const complex<double> NRVec< complex<double> >::amax() const
|
||||
{
|
||||
return v[cblas_izamax(nn, v, 1)];
|
||||
}
|
||||
|
||||
|
||||
// Make Vec unitvector
|
||||
@ -576,7 +530,16 @@ NRVec<T>::~NRVec()
|
||||
{
|
||||
if(!count) return;
|
||||
if(--(*count) <= 0) {
|
||||
if(v) delete[] (v);
|
||||
if(v)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
delete[] (v);
|
||||
#ifdef CUDALA
|
||||
else gpufree(v);
|
||||
#endif
|
||||
}
|
||||
delete count;
|
||||
}
|
||||
}
|
||||
@ -591,12 +554,29 @@ void NRVec<T>::copyonwrite()
|
||||
(*count)--;
|
||||
count = new int;
|
||||
*count = 1;
|
||||
T *newv = new T[nn];
|
||||
memcpy(newv, v, nn*sizeof(T));
|
||||
T *newv;
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
{
|
||||
#endif
|
||||
newv = new T[nn];
|
||||
memcpy(newv, v, nn*sizeof(T));
|
||||
#ifdef CUDALA
|
||||
}
|
||||
else
|
||||
{
|
||||
newv = (T *) gpualloc(nn*sizeof(T));
|
||||
if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
|
||||
cublasScopy(nn*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
v = newv;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Asignment
|
||||
template <typename T>
|
||||
NRVec<T> & NRVec<T>::operator=(const NRVec<T> &rhs)
|
||||
@ -606,17 +586,29 @@ NRVec<T> & NRVec<T>::operator=(const NRVec<T> &rhs)
|
||||
if(count)
|
||||
if(--(*count) == 0)
|
||||
{
|
||||
delete[] v;
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
delete[] v;
|
||||
#ifdef CUDALA
|
||||
else
|
||||
gpufree(v);
|
||||
#endif
|
||||
delete count;
|
||||
}
|
||||
v = rhs.v;
|
||||
nn = rhs.nn;
|
||||
count = rhs.count;
|
||||
#ifdef CUDALA
|
||||
location=rhs.location;
|
||||
#endif
|
||||
if(count) (*count)++;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Resize
|
||||
template <typename T>
|
||||
void NRVec<T>::resize(const int n)
|
||||
@ -629,7 +621,17 @@ void NRVec<T>::resize(const int n)
|
||||
if(n==0)
|
||||
{
|
||||
if(--(*count) <= 0) {
|
||||
if(v) delete[] (v);
|
||||
if(v)
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
delete[] (v);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
gpufree(v);
|
||||
#endif
|
||||
}
|
||||
delete count;
|
||||
}
|
||||
count=0;
|
||||
@ -648,14 +650,33 @@ void NRVec<T>::resize(const int n)
|
||||
count = new int;
|
||||
*count = 1;
|
||||
nn = n;
|
||||
v = new T[nn];
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
v = new T[nn];
|
||||
#ifdef CUDALA
|
||||
else
|
||||
v = (T*) gpualloc(nn*sizeof(T));
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
// *count = 1 in this branch
|
||||
if (n != nn) {
|
||||
nn = n;
|
||||
delete[] v;
|
||||
v = new T[nn];
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
{
|
||||
delete[] v;
|
||||
v = new T[nn];
|
||||
}
|
||||
#ifdef CUDALA
|
||||
else
|
||||
{
|
||||
gpufree(v);
|
||||
v = (T*) gpualloc(nn*sizeof(T));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@ -664,30 +685,18 @@ void NRVec<T>::resize(const int n)
|
||||
template <typename T>
|
||||
NRVec<T> & NRVec<T>::operator|=(const NRVec<T> &rhs)
|
||||
{
|
||||
if (this != &rhs) {
|
||||
#ifdef DEBUG
|
||||
if (!rhs.v) laerror("unallocated rhs in NRVec operator |=");
|
||||
#endif
|
||||
if (count)
|
||||
if (*count > 1) {
|
||||
--(*count);
|
||||
nn = 0;
|
||||
count = 0;
|
||||
v = 0;
|
||||
}
|
||||
if (nn != rhs.nn) {
|
||||
if (v) delete[] (v);
|
||||
nn = rhs.nn;
|
||||
}
|
||||
if(!v) v = new T[nn];
|
||||
if(!count) count = new int;
|
||||
*count = 1;
|
||||
memcpy(v, rhs.v, nn*sizeof(T));
|
||||
}
|
||||
return *this;
|
||||
if (this == &rhs) return *this;
|
||||
*this = rhs;
|
||||
this->copyonwrite();
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
template<typename T>
|
||||
NRVec<complex<T> > complexify(const NRVec<T> &rhs)
|
||||
{
|
||||
@ -696,6 +705,291 @@ for(int i=0; i<rhs.size(); ++i) r[i]=rhs[i];
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
#ifdef CUDALA
|
||||
template<typename T>
|
||||
void NRVec<T>::moveto(const GPUID dest)
|
||||
{
|
||||
if(location==dest) return;
|
||||
location=dest;
|
||||
|
||||
if(v && !count) laerror("internal inconsistency of reference counting 1");
|
||||
if (!count) return;
|
||||
|
||||
if(v && *count==0) laerror("internal inconsistency of reference counting 2");
|
||||
if(!v) return;
|
||||
|
||||
T *vold = v;
|
||||
|
||||
if(dest == cpu) //moving from GPU to CPU
|
||||
{
|
||||
v = new T[nn];
|
||||
gpuget(nn,sizeof(T),vold,v);
|
||||
if(*count == 1) gpufree(vold);
|
||||
else {--(*count); count = new int(1);}
|
||||
}
|
||||
else //moving from CPU to GPU
|
||||
{
|
||||
v=(T *) gpualloc(nn*sizeof(T));
|
||||
gpuput(nn,sizeof(T),vold,v);
|
||||
if(*count == 1) delete[] vold;
|
||||
else {--(*count); count = new int(1);}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
//some template specializations leading to BLAS/CUBLAS calls
|
||||
template<>
|
||||
inline
|
||||
NRVec<double> & NRVec<double>::operator+=(const double &a)
|
||||
{
|
||||
copyonwrite();
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_daxpy(nn, 1.0, &a, 0, v, 1);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
{
|
||||
double *d=gpuputdouble(a);
|
||||
cublasDaxpy(nn, 1.0, d, 0, v, 1);
|
||||
gpufree(d);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline
|
||||
NRVec< complex<double> > &
|
||||
NRVec< complex<double> >::operator+=(const complex<double> &a)
|
||||
{
|
||||
copyonwrite();
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_zaxpy(nn, &CONE, &a, 0, v, 1);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
{
|
||||
complex<double> *d=gpuputcomplex(a);
|
||||
cublasZaxpy(nn, CUONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1);
|
||||
gpufree(d);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline
|
||||
NRVec<double> & NRVec<double>::operator-=(const double &a)
|
||||
{
|
||||
copyonwrite();
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_daxpy(nn, -1.0, &a, 0, v, 1);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
{
|
||||
double *d=gpuputdouble(a);
|
||||
cublasDaxpy(nn, -1.0, d, 0, v, 1);
|
||||
gpufree(d);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline
|
||||
NRVec< complex<double> > &
|
||||
NRVec< complex<double> >::operator-=(const complex<double> &a)
|
||||
{
|
||||
copyonwrite();
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_zaxpy(nn, &CMONE, &a, 0, v, 1);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
{
|
||||
complex<double> *d=gpuputcomplex(a);
|
||||
cublasZaxpy(nn, CUMONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1);
|
||||
gpufree(d);
|
||||
}
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
inline
|
||||
NRVec<double> & NRVec<double>::operator+=(const NRVec<double> &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
|
||||
#endif
|
||||
copyonwrite();
|
||||
cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1);
|
||||
return *this;
|
||||
}
|
||||
template<>
|
||||
inline
|
||||
NRVec< complex<double> > &
|
||||
NRVec< complex<double> >::operator+=(const NRVec< complex<double> > &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
|
||||
#endif
|
||||
copyonwrite();
|
||||
cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
inline
|
||||
NRVec<double> & NRVec<double>::operator-=(const NRVec<double> &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
|
||||
#endif
|
||||
SAME_LOC(*this,rhs);
|
||||
copyonwrite();
|
||||
#ifdef CUDALA
|
||||
if(location==cpu)
|
||||
#endif
|
||||
cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1);
|
||||
#ifdef CUDALA
|
||||
else
|
||||
cublasDaxpy(nn, -1.0, rhs.v, 1, v, 1);
|
||||
#endif
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline
|
||||
NRVec< complex<double> > &
|
||||
NRVec< complex<double> >::operator-=(const NRVec< complex<double> > &rhs)
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
|
||||
#endif
|
||||
copyonwrite();
|
||||
cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline
|
||||
NRVec<double> & NRVec<double>::operator*=(const double &a)
|
||||
{
|
||||
copyonwrite();
|
||||
cblas_dscal(nn, a, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
template<>
|
||||
inline
|
||||
NRVec< complex<double> > &
|
||||
NRVec< complex<double> >::operator*=(const complex<double> &a)
|
||||
{
|
||||
copyonwrite();
|
||||
cblas_zscal(nn, &a, v, 1);
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
inline
|
||||
const double NRVec<double>::operator*(const NRVec<double> &rhs) const
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("dot of incompatible vectors");
|
||||
#endif
|
||||
return cblas_ddot(nn, v, 1, rhs.v, 1);
|
||||
}
|
||||
|
||||
|
||||
template<>
|
||||
inline
|
||||
const complex<double>
|
||||
NRVec< complex<double> >::operator*(const NRVec< complex<double> > &rhs) const
|
||||
{
|
||||
#ifdef DEBUG
|
||||
if (nn != rhs.nn) laerror("dot of incompatible vectors");
|
||||
#endif
|
||||
complex<double> dot;
|
||||
cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot);
|
||||
return dot;
|
||||
}
|
||||
|
||||
// Sum of elements
|
||||
template<>
|
||||
inline
|
||||
const double NRVec<double>::asum() const
|
||||
{
|
||||
return cblas_dasum(nn, v, 1);
|
||||
}
|
||||
|
||||
|
||||
// Dot product: x * y
|
||||
template<>
|
||||
inline
|
||||
const double NRVec<double>::dot(const double *y, const int stride) const
|
||||
{
|
||||
return cblas_ddot(nn, y, stride, v, 1);
|
||||
}
|
||||
|
||||
template<>
|
||||
inline
|
||||
const complex<double>
|
||||
NRVec< complex<double> >::dot(const complex<double> *y, const int stride) const
|
||||
{
|
||||
complex<double> dot;
|
||||
cblas_zdotc_sub(nn, y, stride, v, 1, &dot);
|
||||
return dot;
|
||||
}
|
||||
|
||||
// return norm of the Vec
|
||||
template<>
|
||||
inline
|
||||
const double NRVec<double>::norm() const
|
||||
{
|
||||
#ifdef CUDALA
|
||||
if(location!=cpu) return cublasDnrm2(nn, v, 1);
|
||||
#endif
|
||||
return cblas_dnrm2(nn, v, 1);
|
||||
}
|
||||
|
||||
template<>
|
||||
inline
|
||||
const double NRVec< complex<double> >::norm() const
|
||||
{
|
||||
return cblas_dznrm2(nn, v, 1);
|
||||
}
|
||||
|
||||
// Max element of the array
|
||||
template<>
|
||||
inline
|
||||
const double NRVec<double>::amax() const
|
||||
{
|
||||
return v[cblas_idamax(nn, v, 1)];
|
||||
}
|
||||
|
||||
/*
|
||||
cblas_izamax seems to be missing at least in some cblas versions
|
||||
template<>
|
||||
inline
|
||||
const complex<double> NRVec< complex<double> >::amax() const
|
||||
{
|
||||
return v[cblas_izamax(nn, v, 1)];
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
}//namespace
|
||||
|
||||
#endif /* _LA_VEC_H_ */
|
||||
|
Loading…
Reference in New Issue
Block a user