This commit is contained in:
jiri 2010-09-08 16:27:58 +00:00
parent e8cbf9e5fb
commit e580467e5a
14 changed files with 7106 additions and 3691 deletions

View File

@ -1,3 +1,6 @@
/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
/*******************************************************************************
*******************************************************************************/
#include "la_traits.h"
#include "cuda_la.h"
@ -7,63 +10,54 @@ namespace LA {
GPUID DEFAULT_LOC = cpu;
void set_default_loc(const GPUID loc)
{
DEFAULT_LOC = loc;
void set_default_loc(const GPUID loc){
DEFAULT_LOC = loc;
}
void *gpualloc(size_t size)
{
cublasStatus status;
void *ptr=NULL;
status = cublasAlloc(size,1,&ptr);
if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
return ptr;
void *gpualloc(size_t size){
void *ptr = NULL;
cublasAlloc(size, 1, &ptr);
TEST_CUBLAS("cublasAlloc");
return ptr;
}
void gpufree(void *ptr)
{
cublasStatus status = cublasFree(ptr);
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasFree");
void gpufree(void *ptr){
cublasFree(ptr);
TEST_CUBLAS("cublasFree");
}
void gpuget(size_t n, size_t elsize, const void *from, void *to)
{
cublasStatus status;
status=cublasGetVector(n,elsize,from,1,to,1);
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasGetVector");
void gpuget(size_t n, size_t elsize, const void *from, void *to){
cublasGetVector(n, elsize, from, 1, to, 1);
TEST_CUBLAS("cublasGetVector");
}
void gpuput(size_t n, size_t elsize, const void *from, void *to)
{
cublasStatus status;
status=cublasSetVector(n,elsize,from,1,to,1);
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
void gpuput(size_t n, size_t elsize, const void *from, void *to){
cublasSetVector(n, elsize, from, 1, to, 1);
TEST_CUBLAS("cublasSetVector");
}
double *gpuputdouble(const double &x)
{
cublasStatus status;
void *ptr=NULL;
status = cublasAlloc(1,sizeof(double),&ptr);
if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
status=cublasSetVector(1,sizeof(double),&x,1,ptr,1);
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
return (double *)ptr;
double *gpuputdouble(const double &x){
void *ptr = NULL;
cublasAlloc(1, sizeof(double), &ptr);
TEST_CUBLAS("cublasAlloc");
cublasSetVector(1, sizeof(double), &x, 1, ptr, 1);
TEST_CUBLAS("cublasSetVector");
return (double *)ptr;
}
complex<double> *gpuputcomplex(const complex<double> &x)
{
cublasStatus status;
void *ptr=NULL;
status = cublasAlloc(1,sizeof(complex<double>),&ptr);
if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
status=cublasSetVector(1,sizeof(complex<double>),&x,1,ptr,1);
if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
return (complex<double> *)ptr;
}
complex<double> *gpuputcomplex(const complex<double> &x){
void *ptr = NULL;
cublasAlloc(1, sizeof(complex<double>), &ptr);
TEST_CUBLAS("cublasAlloc");
cublasSetVector(1, sizeof(complex<double>), &x, 1, ptr, 1);
TEST_CUBLAS("cublasSetVector");
return (complex<double> *)ptr;
}
}

View File

@ -1,6 +1,10 @@
//------------------------------------------------------------------------------
/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
//------------------------------------------------------------------------------
#ifndef _CUDA_LA_H
#define _CUDA_LA_H
#include <errno.h>
#ifdef CUDALA
#undef MATPTR
#include "cublas.h"
@ -13,6 +17,7 @@ namespace LA {
#ifdef CUDALA
#define CPU_GPU(x,y) {if((x)!=cpu && (y)!=cpu) laerror("one operand must be in CPU memory");}
#define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");}
#define NOT_CPU(x) {if((x).getlocation()==cpu) laerror("Operation not implemented on CPU (yet). Use .moveto(>0) first.");}
#define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
#define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
#else
@ -22,6 +27,16 @@ namespace LA {
#define SAME_LOC3(x,y,z) {}
#endif
#ifdef DEBUG
#ifdef __GNUG__
#define TEST_CUBLAS(X) { if(cublasGetError() != CUBLAS_STATUS_SUCCESS){ laerror2(#X, __PRETTY_FUNCTION__); } }
#else
#define TEST_CUBLAS(X) { if(cublasGetError() != CUBLAS_STATUS_SUCCESS){ laerror2(#X, __func__); } }
#endif
#else
#define TEST_CUBLAS(X) {}
#endif
typedef enum {undefined=-1, cpu=0, gpu1=1, gpu2=2, gpu3=3, gpu4=4} GPUID;
#ifdef CUDALA
@ -33,6 +48,7 @@ public:
{
cublasStatus status = cublasInit();
if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot init GPU for CUBLAS");
errno = 0;
}
~GPU_START(void)
{
@ -50,8 +66,41 @@ extern complex<double> *gpuputcomplex(const complex<double> &x);
void set_default_loc(const GPUID loc);
extern GPUID DEFAULT_LOC;
template <typename T>
void smart_gpu_set(size_t n, const T& val, void *gpu_to, size_t _step = 1){
void *ptr(NULL);
if(sizeof(T)%sizeof(float) != 0){ laerror("memory alignment error"); }
cublasAlloc(1, sizeof(T), &ptr);
TEST_CUBLAS("cublasAlloc");
cublasSetVector(1, sizeof(T), &val, 1, ptr, 1);
TEST_CUBLAS("cublasSetVector");
if(sizeof(T) == sizeof(float)){
cublasScopy(n, (float*)ptr, 0, ((float*)gpu_to), _step);
TEST_CUBLAS("cublasScopy");
}else if(sizeof(T) == sizeof(double)){
cublasDcopy(n, (double*)ptr, 0, ((double*)gpu_to), _step);
TEST_CUBLAS("cublasDcopy");
}else if(sizeof(T) == sizeof(complex<double>)){
cublasZcopy(n, (cuDoubleComplex*)ptr, 0, (cuDoubleComplex*)gpu_to, _step);
TEST_CUBLAS("cublasZcopy");
}else{
for(register int i=0; i<sizeof(T)/sizeof(float); i++){
cublasScopy(n, (float*)ptr + i, 0, ((float*)gpu_to) + i, sizeof(T)/sizeof(float)*_step);
TEST_CUBLAS("cublasScopy");
}
}
cublasFree(ptr);
TEST_CUBLAS("cublasFree");
}
extern GPUID DEFAULT_LOC;
#endif
}

9
la.h
View File

@ -44,5 +44,12 @@
#include "sparsesmat.h"
#include "vec.h"
using namespace LA;
typedef NRMat<int> NRIMat;
typedef NRMat<double> NRDMat;
typedef NRMat<complex<double> > NRCMat;
typedef NRVec<int> NRIVec;
typedef NRVec<double> NRDVec;
typedef NRVec<complex<double> > NRCVec;
#endif /* _LA_H_ */

View File

@ -32,6 +32,7 @@
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <limits>
#include <complex>

View File

@ -42,20 +42,20 @@ bool _LA_count_check=true;
extern "C" void _findme(void) {}; //for autoconf test we need a function with C linkage
void laerror(const char *s1)
void laerror2(const char *s1, const char *s2)
{
std::cerr << "LA:ERROR - ";
std::cout << "LA:ERROR - ";
if(s1)
{
std::cerr << s1 << "\n";
std::cout << s1 << "\n";
std::cerr << s2 << ": " << s1 << "\n";
std::cout << s2 << ": " << s1 << "\n";
}
#ifdef CUDALA
{
cublasStatus s=cublasGetError();
std::cerr << "CUBLAS status = "<<s<<std::endl;
std::cout << "CUBLAS status = "<<s<<std::endl;
cublasStatus s = cublasGetError();
std::cerr << "CUBLAS status = " << s << std::endl;
std::cout << "CUBLAS status = " << s << std::endl;
}
#endif
if(errno) perror("system error");
@ -63,21 +63,19 @@ std::cout << "CUBLAS status = "<<s<<std::endl;
throw LAerror(s1);
}
//stub for f77 blas called from strassen routine
extern "C" void xerbla_(const char name[6], int *n)
{
char msg[1024];
strcpy(msg,"LAPACK or BLAS error in routine ");
strncat(msg,name,6);
sprintf(msg+strlen(msg),": illegal value of parameter #%d",*n);
laerror(msg);
extern "C" void xerbla_(const char name[6], int *n){
char msg[1024];
strcpy(msg,"LAPACK or BLAS error in routine ");
strncat(msg,name,6);
sprintf(msg+strlen(msg),": illegal value of parameter #%d",*n);
laerror(msg);
}
//with atlas-cblas another error routine is necessary
extern "C" void ATL_xerbla(int p, char *rout, char *form, ...)
{
extern "C" void ATL_xerbla(int p, char *rout, char *form, ...){
char msg0[1024], *msg;
va_list argptr;
va_start(argptr, form);
@ -89,8 +87,7 @@ extern "C" void ATL_xerbla(int p, char *rout, char *form, ...)
laerror(msg0);
}
int cblas_errprn(int ierr, int info, char *form, ...)
{
int cblas_errprn(int ierr, int info, char *form, ...) {
char msg0[1024], *msg;
va_list argptr;
va_start(argptr, form);
@ -99,7 +96,7 @@ int cblas_errprn(int ierr, int info, char *form, ...)
vsprintf(msg, form, argptr);
va_end(argptr);
laerror(msg0);
return 0;
return 0;
}
}//namespace

View File

@ -29,10 +29,15 @@ class LAerror
LAerror(const char *s) {msg=s;};
};
extern void laerror(const char *);
#ifdef __GNUG__
#define laerror(X) { LA::laerror2(X, __PRETTY_FUNCTION__); }
#else
#define laerror(X) { LA::laerror2(X, __func__); }
#endif
inline std::ostream & operator<<(std::ostream &s, const LAerror &x)
{
extern void laerror2(const char *, const char *);
inline std::ostream & operator<<(std::ostream &s, const LAerror &x) {
s << x.msg;
return s;
}

3154
mat.cc

File diff suppressed because it is too large Load Diff

1440
mat.h

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,5 @@
/*
/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
/*******************************************************************************
LA: linear algebra C++ interface library
Copyright (C) 2008 Jiri Pittner <jiri.pittner@jh-inst.cas.cz> or <jiri@pittnerovi.com>
@ -14,8 +15,7 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
*******************************************************************************/
#include "noncblas.h"
#include "laerror.h"
@ -27,178 +27,156 @@
//Level 1 - straightforward wrappers
extern "C" double FORNAME(ddot) (const FINT *n, const double *x, const FINT *incx, const double *y, const FINT *incy);
double cblas_ddot(const int N, const double *X, const int incX,
const double *Y, const int incY)
{
double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp);
#else
return FORNAME(ddot)(&N,X,&incX,Y,&incY);
return FORNAME(ddot)(&N,X,&incX,Y,&incY);
#endif
}
extern "C" void FORNAME(dscal) (const FINT *n, const double *a, double *x, const FINT *incx);
void cblas_dscal(const int N, const double alpha, double *X, const int incX)
{
void cblas_dscal(const int N, const double alpha, double *X, const int incX){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp);
#else
FORNAME(dscal) (&N,&alpha,X,&incX);
FORNAME(dscal) (&N,&alpha,X,&incX);
#endif
}
extern "C" void FORNAME(dcopy) (const FINT *n, const double *x, const FINT *incx, double *y, const FINT *incy);
void cblas_dcopy(const int N, const double *X, const int incX,
double *Y, const int incY)
{
void cblas_dcopy(const int N, const double *X, const int incX, double *Y, const int incY){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
#else
FORNAME(dcopy) (&N,X,&incX,Y,&incY);
FORNAME(dcopy) (&N,X,&incX,Y,&incY);
#endif
}
extern "C" void FORNAME(daxpy) (const FINT *n, const double *a, const double *x, const FINT *incx, double *y, const FINT *incy);
void cblas_daxpy(const int N, const double alpha, const double *X,
const int incX, double *Y, const int incY)
{
void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp);
#else
FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY);
FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY);
#endif
}
extern "C" double FORNAME(dnrm2) (const FINT *n, const double *x, const FINT *incx);
double cblas_dnrm2(const int N, const double *X, const int incX)
{
double cblas_dnrm2(const int N, const double *X, const int incX){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return FORNAME(dnrm2) (&ntmp,X,&incxtmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
return FORNAME(dnrm2) (&ntmp,X,&incxtmp);
#else
return FORNAME(dnrm2) (&N,X,&incX);
return FORNAME(dnrm2) (&N,X,&incX);
#endif
}
extern "C" double FORNAME(dasum) (const FINT *n, const double *x, const FINT *incx);
double cblas_dasum(const int N, const double *X, const int incX)
{
double cblas_dasum(const int N, const double *X, const int incX){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return FORNAME(dasum) (&ntmp,X,&incxtmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
return FORNAME(dasum) (&ntmp,X,&incxtmp);
#else
return FORNAME(dasum) (&N,X,&incX);
return FORNAME(dasum) (&N,X,&incX);
#endif
}
extern "C" void FORNAME(zcopy) (const FINT *n, const void *x, const FINT *incx, void *y, const FINT *incy);
void cblas_zcopy(const int N, const void *X, const int incX,
void *Y, const int incY)
{
void cblas_zcopy(const int N, const void *X, const int incX, void *Y, const int incY){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
#else
FORNAME(zcopy) (&N,X,&incX,Y,&incY);
FORNAME(zcopy) (&N,X,&incX,Y,&incY);
#endif
}
extern "C" void FORNAME(zaxpy) (const FINT *n, const void *a, const void *x, const FINT *incx, void *y, const FINT *incy);
void cblas_zaxpy(const int N, const void *alpha, const void *X,
const int incX, void *Y, const int incY)
{
void cblas_zaxpy(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp);
#else
FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY);
FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY);
#endif
}
extern "C" void FORNAME(zscal) (const FINT *n, const void *a, void *x, const FINT *incx);
void cblas_zscal(const int N, const void *alpha, void *X, const int incX)
{
void cblas_zscal(const int N, const void *alpha, void *X, const int incX){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
FORNAME(zscal)(&ntmp,alpha,X,&incxtmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
FORNAME(zscal)(&ntmp,alpha,X,&incxtmp);
#else
FORNAME(zscal)(&N,alpha,X,&incX);
FORNAME(zscal)(&N,alpha,X,&incX);
#endif
}
extern "C" void FORNAME(zdscal) (const FINT *n, const double *a, void *x, const FINT *incx);
void cblas_zdscal(const int N, const double alpha, void *X, const int incX)
{
void cblas_zdscal(const int N, const double alpha, void *X, const int incX){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp);
#else
FORNAME(zdscal)(&N,&alpha,X,&incX);
FORNAME(zdscal)(&N,&alpha,X,&incX);
#endif
}
extern "C" double FORNAME(dznrm2) (const FINT *n, const void *x, const FINT *incx);
double cblas_dznrm2(const int N, const void *X, const int incX)
{
double cblas_dznrm2(const int N, const void *X, const int incX){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return FORNAME(dznrm2) (&ntmp,X,&incxtmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
return FORNAME(dznrm2) (&ntmp,X,&incxtmp);
#else
return FORNAME(dznrm2) (&N,X,&incX);
return FORNAME(dznrm2) (&N,X,&incX);
#endif
}
//the following ones are f2c-compatible, but is it truly portable???
extern "C" void FORNAME(zdotu) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
void cblas_zdotu_sub(const int N, const void *X, const int incX,
const void *Y, const int incY, void *dotu)
{
void cblas_zdotu_sub(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp);
#else
FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY);
FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY);
#endif
}
extern "C" void FORNAME(zdotc) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
void cblas_zdotc_sub(const int N, const void *X, const int incX,
const void *Y, const int incY, void *dotc)
{
void cblas_zdotc_sub(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc){
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp);
#else
FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
#endif
}
@ -215,15 +193,15 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
const double *X, const int incX,
const double beta, double *Y, const int incY)
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
#else
FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
#endif
}
@ -234,15 +212,15 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
const void *X, const int incX,
const void *beta, void *Y, const int incY)
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
#else
FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
#endif
}
@ -254,17 +232,17 @@ void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
const double alpha, const double *X, const int incX,
const double *Y, const int incY, double *A, const int lda)
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
//swap m-n, y-x
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
//swap m-n, y-x
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
const FINT ldatmp=lda;
FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp);
const FINT mtmp=M;
const FINT ntmp=N;
const FINT incxtmp=incX;
const FINT incytmp=incY;
const FINT ldatmp=lda;
FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp);
#else
FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda);
FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda);
#endif
}
@ -272,14 +250,14 @@ void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
const void *alpha, const void *X, const int incX,
const void *Y, const int incY, void *A, const int lda)
{
LA::laerror("cblas_zgerc cannot be simply converted to fortran order");
laerror("cblas_zgerc cannot be simply converted to fortran order");
}
void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
const void *alpha, const void *X, const int incX,
const void *Y, const int incY, void *A, const int lda)
{
LA::laerror("cblas_zgeru cannot be simply converted to fortran order");
laerror("cblas_zgeru cannot be simply converted to fortran order");
}
extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const double *alpha, const double *a, const FINT *lda, const double *b, const FINT *ldb, const double *beta, double *c, const FINT *ldc);
@ -289,19 +267,19 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
const int lda, const double *B, const int ldb,
const double beta, double *C, const int ldc)
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
//swap a-b, m-n
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
//swap a-b, m-n
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ktmp=K;
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
const FINT ldctmp=ldc;
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ktmp=K;
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
const FINT ldctmp=ldc;
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
&ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
#else
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
&N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
#endif
}
@ -313,20 +291,20 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
const int lda, const void *B, const int ldb,
const void *beta, void *C, const int ldc)
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
//swap a-b, m-n
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
//swap a-b, m-n
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ktmp=K;
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
const FINT ldctmp=ldc;
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ktmp=K;
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
const FINT ldctmp=ldc;
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
&ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
#else
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
&N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
#endif
@ -341,18 +319,18 @@ void cblas_dgemv(const enum CBLAS_ORDER Order,
double *Y, const int incY)
{
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ldatmp=lda;
const FINT incxtmp=incX;
const FINT incytmp=incY;
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
//swap n-m and toggle transposition
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ldatmp=lda;
const FINT incxtmp=incX;
const FINT incytmp=incY;
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
//swap n-m and toggle transposition
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
#else
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
//swap n-m and toggle transposition
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
//swap n-m and toggle transposition
else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
#endif
}
@ -364,36 +342,68 @@ void cblas_zgemv(const enum CBLAS_ORDER Order,
const void *X, const int incX, const void *beta,
void *Y, const int incY)
{
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
if(TransA == CblasConjTrans) LA::laerror("zgemv with CblasConjTrans not supportted");
//swap n-m and toggle transposition
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted");
//swap n-m and toggle transposition
#ifdef FORINT
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ldatmp=lda;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
const FINT mtmp=M;
const FINT ntmp=N;
const FINT ldatmp=lda;
const FINT incxtmp=incX;
const FINT incytmp=incY;
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
#else
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
#endif
}
extern "C" FINT FORNAME(idamax) (const FINT *N, const double *DX, const FINT *INCX);
CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX)
{
CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX) {
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp);
const FINT ntmp=N;
const FINT incxtmp=incX;
return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp);
#else
return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX);
return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX);
#endif
}
extern "C" FINT FORNAME(izamax) (const FINT *N, const void *DX, const FINT *INCX);
CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX) {
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return (CBLAS_INDEX)FORNAME(izamax)(&ntmp, X, &incxtmp);
#else
return (CBLAS_INDEX)FORNAME(izamax)(&N, X, &incX);
#endif
}
/*
extern "C" FINT FORNAME(idamin) (const FINT *N, const double *DX, const FINT *INCX);
CBLAS_INDEX cblas_idamin(const int N, const double *X, const int incX) {
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return (CBLAS_INDEX)FORNAME(idamin)(&ntmp,X,&incxtmp);
#else
return (CBLAS_INDEX)FORNAME(idamin)(&N,X,&incX);
#endif
}
extern "C" FINT FORNAME(izamin) (const FINT *N, const void *DX, const FINT *INCX);
CBLAS_INDEX cblas_izamin(const int N, const void *X, const int incX) {
#ifdef FORINT
const FINT ntmp=N;
const FINT incxtmp=incX;
return (CBLAS_INDEX)FORNAME(izamin)(&ntmp, X, &incxtmp);
#else
return (CBLAS_INDEX)FORNAME(izamin)(&N, X, &incX);
#endif
}
*/
#endif
#ifdef NONCLAPACK
//clapack_dgesv
@ -404,32 +414,47 @@ int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
double *A, const int lda, int *ipiv,
double *B, const int ldb)
{
FINT INFO=0;
if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
//B should be in the same physical order, just transpose A in place and the LU result on output
for(int i=1; i<N; ++i) for(int j=0; j<i; ++j) {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
FINT INFO=0;
if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
//B should be in the same physical order, just transpose A in place and the LU result on output
for(int i=1; i<N; ++i) for(int j=0; j<i; ++j) {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
#ifdef FORINT
const FINT ntmp=N;
const FINT nrhstmp=NRHS;
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
FINT ipivtmp=*ipiv;
FORNAME(dgesv) (&ntmp,&nrhstmp,A,&ldatmp,&ipivtmp,B,&ldbtmp,&INFO);
const FINT ntmp=N;
const FINT nrhstmp=NRHS;
const FINT ldatmp=lda;
const FINT ldbtmp=ldb;
FINT ipivtmp=*ipiv;
FORNAME(dgesv) (&ntmp,&nrhstmp,A,&ldatmp,&ipivtmp,B,&ldbtmp,&INFO);
#else
FORNAME(dgesv) (&N,&NRHS,A,&lda,ipiv,B,&ldb,&INFO);
FORNAME(dgesv) (&N,&NRHS,A,&lda,ipiv,B,&ldb,&INFO);
#endif
for(int i=1; i<N; ++i) for(int j=0; j<i; ++j) {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
return (int)INFO;
for(int i=1; i<N; ++i) for(int j=0; j<i; ++j) {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
return (int)INFO;
}
#endif
void cblas_dswap(const int N, double *X, const int incX,
double *Y, const int incY){
LA::laerror("cblas_dswap is not available... (-DNONCBLAS)");
extern "C" void FORNAME(dswap)(const FINT *N, double *X, const FINT *incX, double *Y, const FINT *incY);
void cblas_dswap(const int N, double *X, const int incX, double *Y, const int incY){
#ifdef FORINT
const FINT N_tmp = N;
const FINT incX_tmp = incX;
const FINT incY_tmp = incY;
FORNAME(dswap)(&N_tmp, X, &incX_tmp, Y, &incY_tmp);
#else
FORNAME(dswap)(&N, X, &incX, Y, &incY);
#endif
}
void cblas_zswap(const int N, void *X, const int incX,
void *Y, const int incY){
LA::laerror("cblas_zswap is not available... (-DNONCBLAS)");
extern "C" void FORNAME(zswap)(const FINT *N, void *X, const FINT *incX, void *Y, const FINT *incY);
void cblas_zswap(const int N, void *X, const int incX, void *Y, const int incY){
#ifdef FORINT
const FINT N_tmp = N;
const FINT incX_tmp = incX;
const FINT incY_tmp = incY;
FORNAME(zswap)(&N_tmp, X, &incX_tmp, Y, &incY_tmp);
#else
FORNAME(zswap)(&N, X, &incX, Y, &incY);
#endif
}

View File

@ -85,6 +85,11 @@ CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
CBLAS_INDEX cblas_icamax(const int N, const void *X, const int incX);
CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX);
/*
CBLAS_INDEX cblas_idamin(const int N, const double *X, const int incX);
CBLAS_INDEX cblas_izamin(const int N, const void *X, const int incX);
*/
/*
* ===========================================================================
* Prototypes for level 1 BLAS routines

688
smat.cc
View File

@ -1,3 +1,6 @@
//------------------------------------------------------------------------------
/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
//------------------------------------------------------------------------------
/*
LA: linear algebra C++ interface library
Copyright (C) 2008 Jiri Pittner <jiri.pittner@jh-inst.cas.cz> or <jiri@pittnerovi.com>
@ -25,50 +28,50 @@
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>
extern "C" {
extern ssize_t read(int, void *, size_t);
extern ssize_t write(int, const void *, size_t);
extern ssize_t read(int, void *, size_t);
extern ssize_t write(int, const void *, size_t);
}
// TODO
// specialize unary minus
namespace LA {
/*
* * Templates first, specializations for BLAS next
*
*/
//raw I/O
/***************************************************************************//**
* routine for raw output
* @param[in] fd file descriptor for output
* @param[in] dim number of elements intended for output
* @param[in] transp reserved
* @see NRMat<T>::get(), NRSMat<T>::copyonwrite()
******************************************************************************/
template <typename T>
void NRSMat<T>::put(int fd, bool dim, bool transp) const
{
void NRSMat<T>::put(int fd, bool dim, bool transp) const {
#ifdef CUDALA
if(location!=cpu)
{
if(location != cpu){
NRSMat<T> tmp= *this;
tmp.moveto(cpu);
tmp.put(fd,dim,transp);
return;
}
#endif
errno=0;
if(dim)
{
if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
}
LA_traits<T>::multiput(NN2,fd,v,dim);
errno = 0;
if(dim){
if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
}
LA_traits<T>::multiput(NN2,fd,v,dim);
}
/***************************************************************************//**
* routine for raw input
* @param[in] fd file descriptor for input
* @param[in] dim number of elements intended for input
* @param[in] transp reserved
* @see NRSMat<T>::put(), NRSMat<T>::copyonwrite()
******************************************************************************/
template <typename T>
void NRSMat<T>::get(int fd, bool dim, bool transp)
{
void NRSMat<T>::get(int fd, bool dim, bool transp) {
#ifdef CUDALA
if(location!=cpu)
{
if(location != cpu){
NRSMat<T> tmp;
tmp.moveto(cpu);
tmp.get(fd,dim,transp);
@ -78,166 +81,306 @@ if(location!=cpu)
}
#endif
int nn0[2]; //align at least 8-byte
errno=0;
if(dim)
{
if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read");
resize(nn0[0]);
}
else
copyonwrite();
LA_traits<T>::multiget(NN2,fd,v,dim);
int nn0[2]; //align at least 8-byte
errno = 0;
if(dim){
if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read");
resize(nn0[0]);
}else{
copyonwrite();
}
LA_traits<T>::multiget(NN2,fd,v,dim);
}
// conversion ctor, symmetrize general Mat into SMat
/***************************************************************************//**
* constructor symmetrizing given matrix \f$A\f$ of general type <code>T</code> yielding \f$(A+A^\mathrm{T})/2\f$
* @param[in] rhs matrix \f$A\f$
******************************************************************************/
template <typename T>
NRSMat<T>::NRSMat(const NRMat<T> &rhs)
{
nn=rhs.nrows();
NRSMat<T>::NRSMat(const NRMat<T> &rhs) {
NOT_GPU(rhs);
nn = rhs.nrows();
#ifdef DEBUG
if (nn != rhs.ncols()) laerror("attempt to convert non-square Mat to SMat");
if(nn != rhs.ncols()) laerror("attempt to convert nonsquare NRMat<T> to NRSMat<T>");
#endif
#ifdef CUDALA
location = rhs.getlocation();
#endif
count = new int;
*count = 1;
v = new T[NN2];
int i, j, k=0;
for (i=0; i<nn; i++)
for (j=0; j<=i;j++) v[k++] = (rhs[i][j] + rhs[j][i])/((T)2);
int i, j, k(0);
for(i=0; i<nn; i++){
for(j=0; j<=i; j++){
v[k++] = (rhs[i][j] + rhs[j][i])/((T)2);
}
}
}
// assign to diagonal
/***************************************************************************//**
* zero out this symmetric matrix of general type <code>T</code> and then set
* the diagonal elements to prescribed value
* @param[in] a scalar value to be assigned to the diagonal
* @return reference to the modified matrix
******************************************************************************/
template <typename T>
NRSMat<T> & NRSMat<T>::operator=(const T &a)
{
NRSMat<T> & NRSMat<T>::operator=(const T &a) {
NOT_GPU(*this);
copyonwrite();
memset(v,0,NN2*sizeof(T));
for (int i=0; i<nn; i++) v[i*(i+1)/2+i] = a;
memset(v, 0, NN2*sizeof(T));
for(register int i=0; i<nn; i++) v[i*(i+1)/2 + i] = a;
return *this;
}
//get diagonal
/***************************************************************************//**
* get or divide by the diagonal of real symmetric double-precision matrix
* @param[in, out] r vector for storing the diagonal
* @param[in] divide
* \li \c false save the diagonal to vector r
* \li \c true divide the vector r by the diagonal elements element-wise
* @param[in] cache reserved
* @return
* \li <tt>divide == true</tt> NULL
* \li <tt>divide == false</tt> pointer to the first element of r
******************************************************************************/
template <typename T>
const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const
{
const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const {
#ifdef DEBUG
if(r.size()!=nn) laerror("incompatible vector in diagonalof()");
if(r.size() != nn) laerror("incompatible vector in const T* NRSMat<T>::diagonalof(NRVec<T> &, const bool, bool)");
#endif
NOT_GPU(*this);
SAME_LOC(*this, r);
r.copyonwrite();
r.copyonwrite();
if (divide)
for (int i=0; i<nn; i++) {T a =v[i*(i+1)/2+i]; if(a!=0.) r[i] /= a;}
else
for (int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
return divide?NULL:&r[0];
if(divide){
for(register int i=0; i<nn; i++){
const T a = v[i*(i+1)/2+i];
if(a != 0.) r[i] /= a;
}
}else{
for(register int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
}
return divide?NULL:&r[0];
}
// unary minus
/***************************************************************************//**
* implements unary minus operator for this symmetric
* matrix of general type <code>T</code>
* @return modified copy of this matrix
******************************************************************************/
template <typename T>
const NRSMat<T> NRSMat<T>::operator-() const
{
NRSMat<T> result(nn);
for(int i=0; i<NN2; i++) result.v[i]= -v[i];
const NRSMat<T> NRSMat<T>::operator-() const {
NOT_GPU(*this);
NRSMat<T> result(nn, getlocation());
for(register int i = 0; i<NN2; i++) result.v[i]= -v[i];
return result;
}
// trace of Smat
/***************************************************************************//**
* implements unary minus operator for this real symmetric matrix
* @return modified copy of this matrix
******************************************************************************/
template <>
const NRSMat<double> NRSMat<double>::operator-() const {
NRSMat<double> result(nn, getlocation());
#ifdef CUDALA
if(location == cpu){
#endif
memcpy(result.v, v, NN2*sizeof(double));
cblas_dscal(NN2, -1., result.v, 1);
#ifdef CUDALA
}else{
cublasDcopy(NN2, v, 1, result.v, 1);
TEST_CUBLAS("cublasDcopy");
cublasDscal(NN2, -1., result.v, 1);
TEST_CUBLAS("cublasDscal");
}
#endif
return result;
}
/***************************************************************************//**
* implements unary minus operator for this hermitian matrix
* @return modified copy of this matrix
******************************************************************************/
template <>
const NRSMat<complex<double> > NRSMat<complex<double> >::operator-() const {
NRSMat<complex<double> > result(nn, getlocation());
#ifdef CUDALA
if(location == cpu) {
#endif
memcpy(result.v, v, NN2*sizeof(complex<double>));
cblas_zscal(NN2, &CMONE, result.v, 1);
#ifdef CUDALA
}else{
cublasZcopy(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)result.v, 1);
TEST_CUBLAS("cublasZcopy");
cublasZscal(NN2, CUMONE, (cuDoubleComplex*)result.v, 1);
TEST_CUBLAS("cublasZscal");
}
#endif
return result;
}
/***************************************************************************//**
* @return the sum of the diagonal elements
******************************************************************************/
template <typename T>
const T NRSMat<T>::trace() const
{
const T NRSMat<T>::trace() const {
NOT_GPU(*this);
T tmp = 0;
for (int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
for(register int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
return tmp;
}
/***************************************************************************//**
* fill this real symmetric matrix with
* pseudorandom numbers generated from uniform distribution
******************************************************************************/
template<>
void NRSMat<double>::randomize(const double &x)
{
for(int i=0; i<NN2; ++i) v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
void NRSMat<double>::randomize(const double &x) {
NOT_GPU(*this);
for(int i=0; i<NN2; ++i){
v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
}
}
/***************************************************************************//**
* Fill this hermitian matrix with pseudorandom numbers generated from uniform
* distribution. The real and imaginary parts are generated independently.
******************************************************************************/
template<>
void NRSMat<complex<double> >::randomize(const double &x)
{
for(int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1.+RAND_MAX) -1.);
for(int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1.+RAND_MAX) -1.);
for(int i=0; i<nn; ++i) for(int j=0; j<=i; ++j) if(i==j) v[i*(i+1)/2+j].imag()=0; //hermitean
void NRSMat<complex<double> >::randomize(const double &x) {
for(register int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
for(register int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
for(register int i=0; i<nn; ++i){
for(register int j=0; j<=i; ++j){
if(i == j) v[i*(i+1)/2+j].imag() = 0; //hermitean
}
}
}
// write matrix to the file with specific format
/***************************************************************************//**
* routine for formatted output via lawritemat
* @param[in] file pointer to <tt>FILE</tt> structure representing the output file
* @param[in] format format specification in standard printf-like form
* @param[in] modulo
* @see lawritemat()
******************************************************************************/
template <typename T>
void NRSMat<T>::fprintf(FILE *file, const char *format, const int modulo) const
{
void NRSMat<T>::fprintf(FILE *file, const char *format, const int modulo) const {
NOT_GPU(*this);
lawritemat(file, (const T *)(*this) ,nn, nn, format, 2, modulo, 1);
}
// read matrix from the file with specific format
/***************************************************************************//**
* routine for formatted input via fscanf
* @param[in] f pointer to <tt>FILE</tt> structure representing the input file
* @param[in] format format specification in standard printf-like form
******************************************************************************/
template <typename T>
void NRSMat<T>::fscanf(FILE *f, const char *format)
{
void NRSMat<T>::fscanf(FILE *f, const char *format) {
int n, m;
if (::fscanf(f,"%d %d",&n,&m) != 2)
laerror("cannot read matrix dimensions in SMat::fscanf");
if (n != m) laerror("different dimensions of SMat");
NOT_GPU(*this);
if (::fscanf(f,"%d %d", &n, &m) != 2)
laerror("cannot read matrix dimensions in NRSMat<T>::fscanf(FILE *, const char *)");
if (n != m) laerror("different dimensions in NRSMat<T>::fscanf(FILE *, const char *)");
resize(n);
for (int i=0; i<n; i++)
for (int j=0; j<n; j++)
if (::fscanf(f,format,&((*this)(i,j))) != 1)
laerror("Smat - cannot read matrix element");
laerror("NRSMat<T>::fscanf(FILE *, const char *) - unable to read matrix element");
}
/*
* BLAS specializations for double and complex<double>
*/
// SMat * Mat
//NOTE: dsymm is not appropriate as it works on UNPACKED symmetric matrix
/***************************************************************************//**
* multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
* with real double-precision dense matrix \f$A\f$
* @param[in] rhs real double-precision matrix \f$A\f$
* @return matrix produt \f$S\times{}A\f$
******************************************************************************/
template<>
const NRMat<double> NRSMat<double>::operator*(const NRMat<double> &rhs) const
{
const NRMat<double> NRSMat<double>::operator*(const NRMat<double> &rhs) const {
#ifdef DEBUG
if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat");
if(nn != rhs.nrows()) laerror("incompatible dimensions in NRMat<double> NRSMat<double>::operator*(const NRMat<double> &)");
#endif
SAME_LOC(*this, rhs);
NRMat<double> result(nn, rhs.ncols(), getlocation());
#ifdef CUDALA
if(location == cpu){
#endif
for(register int k = 0; k<rhs.ncols(); k++){
cblas_dspmv(CblasRowMajor, CblasLower, nn, 1.0, v, rhs[0] + k, rhs.ncols(), 0.0, result[0] + k, rhs.ncols());
}
#ifdef CUDALA
}else{
for(register int k = 0; k<rhs.ncols(); k++){
cublasDspmv('U', nn, 1.0, v, rhs[0] + k, rhs.ncols(), 0.0, result[0] + k, rhs.ncols());
TEST_CUBLAS("cublasDspmv");
}
}
#endif
NRMat<double> result(nn, rhs.ncols());
for (int k=0; k<rhs.ncols(); k++)
cblas_dspmv(CblasRowMajor, CblasLower, nn, 1.0, v, rhs[0]+k, rhs.ncols(),
0.0, result[0]+k, rhs.ncols());
return result;
}
/***************************************************************************//**
* multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
* with real double-precision dense matrix \f$A\f$
* @param[in] rhs real double-precision matrix \f$A\f$
* @return matrix produt \f$S\times{}A\f$
******************************************************************************/
template<>
const NRMat< complex<double> >
NRSMat< complex<double> >::operator*(const NRMat< complex<double> > &rhs) const
{
const NRMat<complex<double> >
NRSMat<complex<double> >::operator*(const NRMat<complex<double> > &rhs) const {
#ifdef DEBUG
if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat");
if (nn != rhs.nrows()) laerror("incompatible dimensions in NRSMat<complex<double> >::operator*(const NRMat<complex<double> > &)");
#endif
SAME_LOC(*this, rhs);
NRMat<complex<double> > result(nn, rhs.ncols(), getlocation());
#ifdef CUDALA
if(location == cpu){
#endif
for(register int k=0; k<rhs.ncols(); k++){
cblas_zhpmv(CblasRowMajor, CblasLower, nn, &CONE, v, rhs[0]+k, rhs.ncols(), &CZERO, result[0]+k, rhs.ncols());
}
#ifdef CUDALA
}else{
for(register int k = 0; k<rhs.ncols(); k++){
cublasZhpmv('U', nn,
CUONE, (cuDoubleComplex*)v, (cuDoubleComplex*)(rhs[0] + k), rhs.ncols(),
CUZERO, (cuDoubleComplex*)(result[0] + k), rhs.ncols());
TEST_CUBLAS("cublasDspmv");
}
}
#endif
NRMat< complex<double> > result(nn, rhs.ncols());
for (int k=0; k<rhs.ncols(); k++)
cblas_zhpmv(CblasRowMajor, CblasLower, nn, &CONE, v, rhs[0]+k, rhs.ncols(),
&CZERO, result[0]+k, rhs.ncols());
return result;
}
// SMat * SMat
/***************************************************************************//**
* multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
* with real double-precision symmetric matrix \f$T\f$
* @return matrix produt \f$S\times{}T\f$ (not necessarily symmetric)
******************************************************************************/
template<>
const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const
{
const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const {
#ifdef DEBUG
if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat");
if (nn != rhs.nn) laerror("incompatible dimensions in NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &)");
#endif
NRMat<double> result(0.0, nn, nn);
double *p, *q;
@ -283,156 +426,295 @@ const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const
}
/***************************************************************************//**
* multiply this complex double-precision symmetric matrix \f$G\f$ stored in packed form
* with complex double-precision symmetric matrix \f$H\f$
* @return matrix produt \f$G\times{}H\f$ (not necessarily symmetric)
******************************************************************************/
template<>
const NRMat< complex<double> >
NRSMat< complex<double> >::operator*(const NRSMat< complex<double> > &rhs) const
{
const NRMat<complex<double> >
NRSMat<complex<double> >::operator*(const NRSMat<complex<double> > &rhs) const {
#ifdef DEBUG
if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat");
if (nn != rhs.nn) laerror("incompatible dimensions in NRSMat<complex<double> >::operator*(const NRSMat<complex<double> > &)");
#endif
NRMat< complex<double> > result(0.0, nn, nn);
NRMat< complex<double> > rhsmat(rhs);
SAME_LOC(*this, rhs);
NRMat<complex<double> > result(nn, nn, getlocation());
NRMat<complex<double> > rhsmat(rhs);
result = *this * rhsmat;
return result;
// laerror("complex SMat*Smat not implemented");
}
// S dot S
/***************************************************************************//**
* compute inner product of this real symmetric matrix \f$A\f$ with given real symmetric matrix \f$B\f$
* i.e. determine the value of
* \f[\sum_{i,j}A_{i,j}B_{i,j}\f]
* @param[in] rhs matrix \f$B\f$
* @return computed inner product
******************************************************************************/
template<>
const double NRSMat<double>::dot(const NRSMat<double> &rhs) const
{
const double NRSMat<double>::dot(const NRSMat<double> &rhs) const {
double ret(0.);
#ifdef DEBUG
if (nn != rhs.nn) laerror("dot of incompatible SMat's");
if (nn != rhs.nn) laerror("incompatible dimensions in double NRSMat<double>::dot(const NRSMat<double> &)");
#endif
return cblas_ddot(NN2, v, 1, rhs.v, 1);
SAME_LOC(*this, rhs);
#ifdef CUDALA
if(location == cpu){
#endif
ret = cblas_ddot(NN2, v, 1, rhs.v, 1);
#ifdef CUDALA
}else{
ret = cublasDdot(NN2, v, 1, rhs.v, 1);
}
#endif
return ret;
}
/***************************************************************************//**
* compute inner product of this complex symmetric matrix \f$A\f$ with given complex symmetric matrix \f$B\f$
* i.e. determine the value of
* \f[\sum_{i,j}\overbar{A_{i,j}}B_{i,j}\f]
* @param[in] rhs matrix \f$B\f$
* @return computed inner product
******************************************************************************/
template<>
const complex<double>
NRSMat< complex<double> >::dot(const NRSMat< complex<double> > &rhs) const
{
const complex<double> NRSMat<complex<double> >::dot(const NRSMat<complex<double> > &rhs) const {
#ifdef DEBUG
if (nn != rhs.nn) laerror("dot of incompatible SMat's");
if (nn != rhs.nn) laerror("incompatible dimensions in complex<double> NRSMat<complex<double> >::dot(const NRSMat<complex<double> > &)");
#endif
complex<double> dot(0., 0.);
SAME_LOC(*this, rhs);
#ifdef CUDALA
if(location == cpu){
#endif
complex<double> dot;
cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
#ifdef CUDALA
}else{
const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
dot = complex<double>(cuCreal(_dot), cuCimag(_dot));
TEST_CUBLAS("cublasZdotc");
}
#endif
return dot;
}
/***************************************************************************//**
* compute inner product of this real double-precision symmetric matrix \f$S\f$ of order \f$n\f$
* with given real double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$
* @param[in] rhs real double-precision vector \f$\vec{v}\f$
* @return computed inner product
******************************************************************************/
template<>
const double NRSMat<double>::dot(const NRVec<double> &rhs) const
{
const double NRSMat<double>::dot(const NRVec<double> &rhs) const {
double ret(0.0);
#ifdef DEBUG
if (NN2 != rhs.nn) laerror("dot of incompatible SMat's");
if(NN2 != rhs.nn) laerror("incompatible dimensions in double NRSMat<double>::dot(const NRVec<double> &)");
#endif
SAME_LOC(*this, rhs);
#ifdef CUDALA
if(location == cpu){
#endif
ret = cblas_ddot(NN2, v, 1, rhs.v, 1);
#ifdef CUDALA
}else{
ret = cublasDdot(NN2, v, 1, rhs.v, 1);
TEST_CUBLAS("cublasDdot");
}
#endif
return cblas_ddot(NN2, v, 1, rhs.v, 1);
}
/***************************************************************************//**
* compute inner product of this complex double-precision hermitian matrix \f$H\f$ of order \f$n\f$
* with given complex double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$
* @param[in] rhs complex double-precision vector \f$\vec{v}\f$
* @return computed inner product
******************************************************************************/
template<>
const complex<double>
NRSMat< complex<double> >::dot(const NRVec< complex<double> > &rhs) const
{
NRSMat<complex<double> >::dot(const NRVec<complex<double> > &rhs) const {
#ifdef DEBUG
if (NN2 != rhs.nn) laerror("dot of incompatible SMat's");
if(NN2 != rhs.nn) laerror("incompatible dimensions in complex<double> NRSMat<complex<double> >::dot(const NRVec<complex<double> > &)");
#endif
complex<double> dot(0., 0.);
SAME_LOC(*this, rhs);
#ifdef CUDALA
if(location == cpu){
#endif
complex<double> dot;
cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
#ifdef CUDALA
}else{
const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)rhs.v, 1);
TEST_CUBLAS("cublasZdotc");
dot = complex<double>(cuCreal(_dot), cuCimag(_dot));
}
#endif
return dot;
}
// norm of the matrix
/***************************************************************************//**
* compute the Frobenius norm of this real double-precision symmetric matrix
* @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation
******************************************************************************/
template<>
const double NRSMat<double>::norm(const double scalar) const
{
if (!scalar) return cblas_dnrm2(NN2, v, 1);
double sum = 0;
int k = 0;
for (int i=0; i<nn; ++i)
for (int j=0; j<=i; ++j) {
register double tmp;
tmp = v[k++];
if (i == j) tmp -= scalar;
const double NRSMat<double>::norm(const double scalar) const {
if(!scalar){
double ret(0.);
#ifdef CUDALA
if(location == cpu){
#endif
ret = cblas_dnrm2(NN2, v, 1);
#ifdef CUDALA
}else{
ret = cublasDnrm2(NN2, v, 1);
TEST_CUBLAS("cublasDnrm2");
}
#endif
return ret;
}
NOT_GPU(*this);
double sum(0.);
int k(0);
for(register int i=0; i<nn; ++i){
for(register int j=0; j<=i; ++j) {
register double tmp = v[k++];
if(i == j) tmp -= scalar;
sum += tmp*tmp;
}
}
return std::sqrt(sum);
}
/***************************************************************************//**
* compute the Frobenius norm of this complex double-precision hermitian matrix
* @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation
******************************************************************************/
template<>
const double NRSMat< complex<double> >::norm(const complex<double> scalar) const
{
if (!(scalar.real()) && !(scalar.imag()))
return cblas_dznrm2(NN2, v, 1);
double sum = 0;
const double NRSMat< complex<double> >::norm(const complex<double> scalar) const {
if(!(scalar.real()) && !(scalar.imag())){
double ret(0.);
#ifdef CUDALA
if(location == cpu){
#endif
ret = cblas_dznrm2(NN2, v, 1);
#ifdef CUDALA
}else{
ret = cublasDznrm2(NN2, (cuDoubleComplex*)v, 1);
TEST_CUBLAS("cublasDznrm2");
}
#endif
return ret;
}
int k(0);
double sum(0.);
complex<double> tmp;
int k = 0;
for (int i=0; i<nn; ++i)
for (int j=0; j<=i; ++j) {
for(register int i=0; i<nn; ++i){
for(register int j=0; j<=i; ++j){
tmp = v[k++];
if (i == j) tmp -= scalar;
sum += tmp.real()*tmp.real() + tmp.imag()*tmp.imag();
}
}
return std::sqrt(sum);
}
// axpy: S = S * a
/***************************************************************************//**
* for this real double-precision symmetric matrix \f$S\f$ stored in packed form,
* real scalar value \f$\alpha\f$ and real double-precision symmetric matrix \f$T\f$, compute
* \f[S \leftarrow \alpha T + S\f]
******************************************************************************/
template<>
void NRSMat<double>::axpy(const double alpha, const NRSMat<double> & x)
{
void NRSMat<double>::axpy(const double alpha, const NRSMat<double> &x) {
#ifdef DEBUG
if (nn != x.nn) laerror("axpy of incompatible SMats");
if(nn != x.nn) laerror("incompatible dimensions in void NRSMat<double>::axpy(const double, const NRSMat<double>&)");
#endif
SAME_LOC(*this, x);
copyonwrite();
#ifdef CUDALA
if(location == cpu){
#endif
cblas_daxpy(NN2, alpha, x.v, 1, v, 1);
}
template<>
void NRSMat< complex<double> >::axpy(const complex<double> alpha,
const NRSMat< complex<double> > & x)
{
#ifdef DEBUG
if (nn != x.nn) laerror("axpy of incompatible SMats");
#ifdef CUDALA
}else{
cublasDaxpy(NN2, alpha, x.v, 1, v, 1);
TEST_CUBLAS("cublasDaxpy");
}
#endif
copyonwrite();
cblas_zaxpy(nn, &alpha, x.v, 1, v, 1);
}
//complex from real
/***************************************************************************//**
* for this complex double-precision hermitian matrix \f$H\f$ stored in packed form,
* complex scalar value \f$\alpha\f$ and complex double-precision hermitian matrix \f$G\f$, compute
* \f[H \leftarrow \alpha G + H\f]
******************************************************************************/
template<>
NRSMat<complex<double> >::NRSMat(const NRSMat<double> &rhs, bool imagpart)
: nn(rhs.nrows()), v(new complex<double>[rhs.nrows()*(rhs.nrows()+1)/2]), count(new int(1))
{
memset(v,0,nn*(nn+1)/2*sizeof(complex<double>));
cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2);
void NRSMat<complex<double> >::axpy(const complex<double> alpha, const NRSMat<complex<double> > & x) {
#ifdef DEBUG
if(nn != x.nn) laerror("incompatible dimensions in void NRSMat<complex<double> >::axpy(const complex<double> , const NRSMat<complex<double> >&)");
#endif
SAME_LOC(*this, x);
copyonwrite();
#ifdef CUDALA
if(location == cpu){
#endif
cblas_zaxpy(nn, &alpha, x.v, 1, v, 1);
#ifdef CUDALA
}else{
const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
cublasZaxpy(NN2, _alpha, (cuDoubleComplex*)x.v, 1, (cuDoubleComplex*)v, 1);
TEST_CUBLAS("cublasZaxpy");
}
#endif
}
/***************************************************************************//**
* create hermitian matrix \f$H\f$ from given real double-precision symmetric
* matrix \f$S\f$
* @param[in] rhs real double-precision symmetric matrix \f$S\f$
* @param[in] imagpart flag determining whether \f$S\f$ should correspond to the real or imaginary part of \f$H\f$
******************************************************************************/
template<>
NRSMat<complex<double> >::NRSMat(const NRSMat<double> &rhs, bool imagpart): nn(rhs.nrows()), count(new int(1)) {
//inconsistent in general case?
const int nnp1 = nn*(nn + 1)/2;
#ifdef CUDALA
location = rhs.getlocation();
if(location == cpu){
#endif
v = new complex<double>[nnp1];
memset(v, 0, nnp1*sizeof(complex<double>));
cblas_dcopy(nnp1, &rhs(0, 0), 1, ((double *)v) + (imagpart?1:0), 2);
#ifdef CUDALA
}else{
v = (complex<double>*) gpualloc(nnp1*sizeof(complex<double>));
//some template specializations leading to BLAS/CUBLAS calls
complex<double> *_val = gpuputcomplex(CZERO);
cublasZcopy(nnp1, (cuDoubleComplex*)_val, 0, (cuDoubleComplex*)v, 1);
TEST_CUBLAS("cublasZcopy");
gpufree(_val);
cublasDcopy(nnp1, (double*)(&rhs(0,0)), 1, ((double*)v) + (imagpart?1:0), 2);
TEST_CUBLAS("cublasDcopy");
}
#endif
}
//////////////////////////////////////////////////////////////////////////////
////// forced instantization in the corresponding object file
/***************************************************************************//**
* forced instantization in the corresponding object file
******************************************************************************/
template class NRSMat<double>;
template class NRSMat< complex<double> >;
template class NRSMat<complex<double> >;
template class NRSMat<long long>;
template class NRSMat<long>;

1153
smat.h

File diff suppressed because it is too large Load Diff

1245
vec.cc

File diff suppressed because it is too large Load Diff

1584
vec.h

File diff suppressed because it is too large Load Diff