cuda

2010-09-08 16:27:58 +00:00
parent e8cbf9e5fb
commit e580467e5a
14 changed files with 7106 additions and 3691 deletions
--- a/cuda_la.cc
+++ b/cuda_la.cc
@@ -1,3 +1,6 @@
+/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
+/*******************************************************************************
+*******************************************************************************/
 #include "la_traits.h"
 #include "cuda_la.h"

@@ -7,63 +10,54 @@ namespace LA {

 GPUID DEFAULT_LOC = cpu;

-void set_default_loc(const GPUID loc) 
-{
-DEFAULT_LOC = loc;
+void set_default_loc(const GPUID loc){
+	DEFAULT_LOC = loc;
 }

-void *gpualloc(size_t size)
-{
-cublasStatus status;
-void *ptr=NULL;
-status = cublasAlloc(size,1,&ptr);
-if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
-return ptr;
+void *gpualloc(size_t size){
+	void *ptr = NULL;
+	cublasAlloc(size, 1, &ptr);
+	TEST_CUBLAS("cublasAlloc");
+	return ptr;
 }


-void gpufree(void *ptr)
-{
-cublasStatus status = cublasFree(ptr);
-if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasFree");
+void gpufree(void *ptr){
+	cublasFree(ptr);
+	TEST_CUBLAS("cublasFree");
 }

-void gpuget(size_t n, size_t elsize, const void *from, void *to)
-{
-cublasStatus status;
-status=cublasGetVector(n,elsize,from,1,to,1);
-if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasGetVector");
+void gpuget(size_t n, size_t elsize, const void *from, void *to){
+	cublasGetVector(n, elsize, from, 1, to, 1);
+	TEST_CUBLAS("cublasGetVector");
 }

-void gpuput(size_t n, size_t elsize, const void *from, void *to)
-{
-cublasStatus status;
-status=cublasSetVector(n,elsize,from,1,to,1);
-if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
+void gpuput(size_t n, size_t elsize, const void *from, void *to){
+	cublasSetVector(n, elsize, from, 1, to, 1);
+	TEST_CUBLAS("cublasSetVector");
 }

-double *gpuputdouble(const double &x)
-{
-cublasStatus status;
-void *ptr=NULL;
-status = cublasAlloc(1,sizeof(double),&ptr);
-if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
-status=cublasSetVector(1,sizeof(double),&x,1,ptr,1);
-if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
-return (double *)ptr;
+double *gpuputdouble(const double &x){
+	void *ptr = NULL;
+	cublasAlloc(1, sizeof(double), &ptr);
+	TEST_CUBLAS("cublasAlloc");
+
+	cublasSetVector(1, sizeof(double), &x, 1, ptr, 1);
+	TEST_CUBLAS("cublasSetVector");
+
+	return (double *)ptr;
 }

-complex<double> *gpuputcomplex(const complex<double> &x)
-{
-cublasStatus status;
-void *ptr=NULL;
-status = cublasAlloc(1,sizeof(complex<double>),&ptr);
-if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
-status=cublasSetVector(1,sizeof(complex<double>),&x,1,ptr,1);
-if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
-return (complex<double> *)ptr;
-}
+complex<double> *gpuputcomplex(const complex<double> &x){
+	void *ptr = NULL;
+	cublasAlloc(1, sizeof(complex<double>), &ptr);
+	TEST_CUBLAS("cublasAlloc");

+	cublasSetVector(1, sizeof(complex<double>), &x, 1, ptr, 1);
+	TEST_CUBLAS("cublasSetVector");
+
+	return (complex<double> *)ptr;
+}


 }
--- a/cuda_la.h
+++ b/cuda_la.h
@@ -1,6 +1,10 @@
+//------------------------------------------------------------------------------
+/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
+//------------------------------------------------------------------------------
 #ifndef _CUDA_LA_H
 #define _CUDA_LA_H

+#include <errno.h>
 #ifdef CUDALA
 #undef MATPTR
 #include "cublas.h"
@@ -13,6 +17,7 @@ namespace LA {
 #ifdef CUDALA
 #define CPU_GPU(x,y) {if((x)!=cpu && (y)!=cpu) laerror("one operand must be in CPU memory");}
 #define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");}
+#define NOT_CPU(x) {if((x).getlocation()==cpu) laerror("Operation not implemented on CPU (yet). Use .moveto(>0) first.");}
 #define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
 #define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
 #else
@@ -22,6 +27,16 @@ namespace LA {
 #define SAME_LOC3(x,y,z) {}
 #endif

+#ifdef DEBUG
+	#ifdef __GNUG__
+		#define TEST_CUBLAS(X) { if(cublasGetError() != CUBLAS_STATUS_SUCCESS){ laerror2(#X, __PRETTY_FUNCTION__); } }
+	#else
+		#define TEST_CUBLAS(X) { if(cublasGetError() != CUBLAS_STATUS_SUCCESS){ laerror2(#X, __func__); } }
+	#endif
+#else
+	#define TEST_CUBLAS(X) {}
+#endif
+
 typedef enum {undefined=-1, cpu=0, gpu1=1, gpu2=2, gpu3=3, gpu4=4} GPUID;

 #ifdef CUDALA
@@ -33,6 +48,7 @@ public:
 		{
 		cublasStatus status = cublasInit();
 		if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot init GPU for CUBLAS");
+		errno = 0;
 		}
 	~GPU_START(void)
 		{
@@ -50,8 +66,41 @@ extern complex<double> *gpuputcomplex(const complex<double> &x);

 void set_default_loc(const GPUID loc);

-extern GPUID DEFAULT_LOC;
+template <typename T>
+void smart_gpu_set(size_t n, const T& val, void *gpu_to, size_t _step = 1){
+	void *ptr(NULL);
+	if(sizeof(T)%sizeof(float) != 0){ laerror("memory alignment error"); }

+	cublasAlloc(1, sizeof(T), &ptr);
+	TEST_CUBLAS("cublasAlloc");
+
+	cublasSetVector(1, sizeof(T), &val, 1, ptr, 1);
+	TEST_CUBLAS("cublasSetVector");
+
+	if(sizeof(T) == sizeof(float)){
+		cublasScopy(n, (float*)ptr, 0, ((float*)gpu_to), _step);
+		TEST_CUBLAS("cublasScopy");
+
+	}else if(sizeof(T) == sizeof(double)){
+		cublasDcopy(n, (double*)ptr, 0, ((double*)gpu_to), _step);
+		TEST_CUBLAS("cublasDcopy");
+
+	}else if(sizeof(T) == sizeof(complex<double>)){
+		cublasZcopy(n, (cuDoubleComplex*)ptr, 0, (cuDoubleComplex*)gpu_to, _step);
+		TEST_CUBLAS("cublasZcopy");
+
+	}else{
+		for(register int i=0; i<sizeof(T)/sizeof(float); i++){
+			cublasScopy(n, (float*)ptr + i, 0, ((float*)gpu_to) + i, sizeof(T)/sizeof(float)*_step);
+			TEST_CUBLAS("cublasScopy");
+		}
+	}
+
+	cublasFree(ptr);
+	TEST_CUBLAS("cublasFree");
+}
+
+extern GPUID DEFAULT_LOC;

 #endif
 }
--- a/la.h
+++ b/la.h
@@ -44,5 +44,12 @@
 #include "sparsesmat.h"
 #include "vec.h"

-
+using namespace LA;
+typedef NRMat<int> NRIMat;
+typedef NRMat<double> NRDMat;
+typedef NRMat<complex<double> > NRCMat;
+typedef NRVec<int> NRIVec;
+typedef NRVec<double> NRDVec;
+typedef NRVec<complex<double> > NRCVec;
 #endif /* _LA_H_ */
+
--- a/la_traits.h
+++ b/la_traits.h
@@ -32,6 +32,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <iostream>
+#include <limits>
 #include <complex>


--- a/laerror.cc
+++ b/laerror.cc
@@ -42,20 +42,20 @@ bool _LA_count_check=true;

 extern "C" void _findme(void) {}; //for autoconf test we need a function with C linkage

-void laerror(const char *s1)
+void laerror2(const char *s1, const char *s2)
 {
  std::cerr << "LA:ERROR - ";
  std::cout << "LA:ERROR - ";
  if(s1)
  {
-    std::cerr << s1  << "\n";
-    std::cout << s1  << "\n";
+    std::cerr << s2 << ": " << s1 << "\n";
+    std::cout << s2 << ": " << s1 << "\n";
  }
 #ifdef CUDALA
 {
-cublasStatus s=cublasGetError();
-std::cerr << "CUBLAS status = "<<s<<std::endl;
-std::cout << "CUBLAS status = "<<s<<std::endl;
+cublasStatus s = cublasGetError();
+std::cerr << "CUBLAS status = " << s << std::endl;
+std::cout << "CUBLAS status = " << s << std::endl;
 }
 #endif
  if(errno) perror("system error");
@@ -63,43 +63,40 @@ std::cout << "CUBLAS status = "<<s<<std::endl;
 throw LAerror(s1); 
 }

+
 //stub for f77 blas called from strassen routine
-extern "C" void xerbla_(const char name[6], int *n)
-{
-char msg[1024];
-strcpy(msg,"LAPACK or BLAS error in routine ");
-strncat(msg,name,6);
-sprintf(msg+strlen(msg),": illegal value of parameter #%d",*n);
-laerror(msg);
+extern "C" void xerbla_(const char name[6], int *n){
+	char msg[1024];
+	strcpy(msg,"LAPACK or BLAS error in routine ");
+	strncat(msg,name,6);
+	sprintf(msg+strlen(msg),": illegal value of parameter #%d",*n);
+	laerror(msg);
 }


 //with atlas-cblas another error routine is necessary
-
-extern "C" void ATL_xerbla(int p, char *rout, char *form, ...)
-{
-  char msg0[1024], *msg;
-   va_list argptr;
-   va_start(argptr, form);
-   strcpy(msg0,"ATLAS error\n");
-   msg=msg0+strlen(msg0);
-   if (p) {sprintf(msg, "Parameter %d to routine %s was incorrect\n", p, rout); msg+=strlen(msg);}
-   vsprintf(msg, form, argptr);
-   va_end(argptr);
-   laerror(msg0);
+extern "C" void ATL_xerbla(int p, char *rout, char *form, ...){
+	char msg0[1024], *msg;
+	va_list argptr;
+	va_start(argptr, form);
+	strcpy(msg0,"ATLAS error\n");
+	msg=msg0+strlen(msg0);
+	if (p) {sprintf(msg, "Parameter %d to routine %s was incorrect\n", p, rout); msg+=strlen(msg);}
+	vsprintf(msg, form, argptr);
+	va_end(argptr);
+	laerror(msg0);
 }

-int cblas_errprn(int ierr, int info, char *form, ...)
-{
-  char msg0[1024], *msg;
-   va_list argptr;
-   va_start(argptr, form);
-   sprintf(msg0,"CBLAS error %d %d\n",ierr,info);
-   msg=msg0+strlen(msg0);
-   vsprintf(msg, form, argptr);
-   va_end(argptr);
-   laerror(msg0);
-return 0;
+int cblas_errprn(int ierr, int info, char *form, ...) {
+	char msg0[1024], *msg;
+	va_list argptr;
+	va_start(argptr, form);
+	sprintf(msg0,"CBLAS error %d %d\n",ierr,info);
+	msg=msg0+strlen(msg0);
+	vsprintf(msg, form, argptr);
+	va_end(argptr);
+	laerror(msg0);
+	return 0;
 }

 }//namespace
--- a/laerror.h
+++ b/laerror.h
@@ -29,10 +29,15 @@ class LAerror
 	LAerror(const char *s) {msg=s;};
 	};

-extern void laerror(const char *);
+#ifdef __GNUG__
+	#define laerror(X) { LA::laerror2(X, __PRETTY_FUNCTION__); }
+#else
+	#define laerror(X) { LA::laerror2(X, __func__); }
+#endif

-inline std::ostream & operator<<(std::ostream &s, const LAerror &x)
-{
+extern void laerror2(const char *, const char *);
+
+inline std::ostream & operator<<(std::ostream &s, const LAerror &x) {
 s << x.msg;
 return s;
 }
--- a/mat.cc
+++ b/mat.cc
--- a/mat.h
+++ b/mat.h
--- a/noncblas.cc
+++ b/noncblas.cc
@@ -1,4 +1,5 @@
-/*
+/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
+/*******************************************************************************
    LA: linear algebra C++ interface library
    Copyright (C) 2008 Jiri Pittner <jiri.pittner@jh-inst.cas.cz> or <jiri@pittnerovi.com>

@@ -14,8 +15,7 @@

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
-
+*******************************************************************************/

 #include "noncblas.h"
 #include "laerror.h"
@@ -27,178 +27,156 @@
 //Level 1 - straightforward wrappers

 extern "C" double FORNAME(ddot) (const FINT *n, const double *x, const FINT *incx, const double *y, const FINT *incy);
-double cblas_ddot(const int N, const double *X, const int incX,
-                  const double *Y, const int incY)
-{
+double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp);
 #else
-return FORNAME(ddot)(&N,X,&incX,Y,&incY);
+	return FORNAME(ddot)(&N,X,&incX,Y,&incY);
 #endif
 }

 extern "C" void FORNAME(dscal) (const FINT *n, const double *a, double *x, const FINT *incx);
-void cblas_dscal(const int N, const double alpha, double *X, const int incX)
-{
+void cblas_dscal(const int N, const double alpha, double *X, const int incX){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp);
 #else
-FORNAME(dscal) (&N,&alpha,X,&incX);
+	FORNAME(dscal) (&N,&alpha,X,&incX);
 #endif
 }

 extern "C" void FORNAME(dcopy) (const FINT *n, const double *x, const FINT *incx, double *y, const FINT *incy);
-void cblas_dcopy(const int N, const double *X, const int incX,
-                 double *Y, const int incY)
-{
+void cblas_dcopy(const int N, const double *X, const int incX, double *Y, const int incY){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
 #else
-FORNAME(dcopy) (&N,X,&incX,Y,&incY);
+	FORNAME(dcopy) (&N,X,&incX,Y,&incY);
 #endif
 }

 extern "C" void FORNAME(daxpy) (const FINT *n, const double *a, const double *x, const FINT *incx, double *y, const FINT *incy);
-void cblas_daxpy(const int N, const double alpha, const double *X,
-                 const int incX, double *Y, const int incY)
-{
+void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp);
 #else
-FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY);
+	FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY);
 #endif
 }

 extern "C" double FORNAME(dnrm2) (const FINT *n, const double *x, const FINT *incx);
-double cblas_dnrm2(const int N, const double *X, const int incX)
-{
+double cblas_dnrm2(const int N, const double *X, const int incX){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-return FORNAME(dnrm2) (&ntmp,X,&incxtmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	return FORNAME(dnrm2) (&ntmp,X,&incxtmp);
 #else
-return FORNAME(dnrm2) (&N,X,&incX);
+	return FORNAME(dnrm2) (&N,X,&incX);
 #endif
 }

 extern "C" double FORNAME(dasum) (const FINT *n, const double *x, const FINT *incx);
-double cblas_dasum(const int N, const double *X, const int incX)
-{
+double cblas_dasum(const int N, const double *X, const int incX){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-return FORNAME(dasum) (&ntmp,X,&incxtmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	return FORNAME(dasum) (&ntmp,X,&incxtmp);
 #else
-return FORNAME(dasum) (&N,X,&incX);
+	return FORNAME(dasum) (&N,X,&incX);
 #endif
 }

 extern "C" void FORNAME(zcopy) (const FINT *n, const void *x, const FINT *incx, void *y, const FINT *incy);
-void cblas_zcopy(const int N, const void *X, const int incX,
-                 void *Y, const int incY)
-{
+void cblas_zcopy(const int N, const void *X, const int incX, void *Y, const int incY){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
 #else
-FORNAME(zcopy) (&N,X,&incX,Y,&incY);
+	FORNAME(zcopy) (&N,X,&incX,Y,&incY);
 #endif
 }

 extern "C" void FORNAME(zaxpy) (const FINT *n, const void *a, const void *x, const FINT *incx, void *y, const FINT *incy);
-void cblas_zaxpy(const int N, const void *alpha, const void *X,
-                 const int incX, void *Y, const int incY)
-{
+void cblas_zaxpy(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp);
 #else
-FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY);
+	FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY);
 #endif
 }

 extern "C" void FORNAME(zscal) (const FINT *n, const void *a, void *x, const FINT *incx);
-void cblas_zscal(const int N, const void *alpha, void *X, const int incX)
-{
+void cblas_zscal(const int N, const void *alpha, void *X, const int incX){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-FORNAME(zscal)(&ntmp,alpha,X,&incxtmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	FORNAME(zscal)(&ntmp,alpha,X,&incxtmp);
 #else
-FORNAME(zscal)(&N,alpha,X,&incX);
+	FORNAME(zscal)(&N,alpha,X,&incX);
 #endif
 }

 extern "C" void FORNAME(zdscal) (const FINT *n, const double *a, void *x, const FINT *incx);
-void cblas_zdscal(const int N, const double alpha, void *X, const int incX)
-{
+void cblas_zdscal(const int N, const double alpha, void *X, const int incX){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp);
 #else
-FORNAME(zdscal)(&N,&alpha,X,&incX);
+	FORNAME(zdscal)(&N,&alpha,X,&incX);
 #endif
 }


 extern "C" double FORNAME(dznrm2) (const FINT *n, const void *x, const FINT *incx);
-double cblas_dznrm2(const int N, const void *X, const int incX)
-{
+double cblas_dznrm2(const int N, const void *X, const int incX){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-return FORNAME(dznrm2) (&ntmp,X,&incxtmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	return FORNAME(dznrm2) (&ntmp,X,&incxtmp);
 #else
-return FORNAME(dznrm2) (&N,X,&incX);
+	return FORNAME(dznrm2) (&N,X,&incX);
 #endif
 }


 //the following ones are f2c-compatible, but is it truly portable???
-
 extern "C" void FORNAME(zdotu) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
-
-void   cblas_zdotu_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotu)
-{
+void cblas_zdotu_sub(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp);
 #else
-FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY);
+	FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY);
 #endif
 }


 extern "C" void FORNAME(zdotc) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
-void   cblas_zdotc_sub(const int N, const void *X, const int incX,
-                       const void *Y, const int incY, void *dotc)
-{
+void cblas_zdotc_sub(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc){
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp);
 #else
-FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
+	FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
 #endif
 }

@@ -211,38 +189,38 @@ FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);

 extern "C" void FORNAME(dspmv) (const char *uplo, const FINT *n, const double *alpha, const double *ap, const double *x, const FINT *incx, const double *beta, double *y, const FINT *incy);
 void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const double alpha, const double *Ap,
-                 const double *X, const int incX,
-                 const double beta, double *Y, const int incY)
+		const int N, const double alpha, const double *Ap,
+		const double *X, const int incX,
+		const double beta, double *Y, const int incY)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
-if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
+	if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
 #else
-FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
+	FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
 #endif
 }


 extern "C" void FORNAME(zhpmv) (const char *uplo, const FINT *n, const void *alpha, const void *ap, const void *x, const FINT *incx, const void *beta, void *y, const FINT *incy);
 void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
-                 const int N, const void *alpha, const void *Ap,
-                 const void *X, const int incX,
-                 const void *beta, void *Y, const int incY)
+		const int N, const void *alpha, const void *Ap,
+		const void *X, const int incX,
+		const void *beta, void *Y, const int incY)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
-if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
+	if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
 #else
-FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
+	FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
 #endif
 }

@@ -251,185 +229,232 @@ FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);

 extern "C" void FORNAME(dger) (const FINT *m, const FINT *n, const double *alpha, const double *x, const FINT *incx, const double *y, const FINT *incy, double *a, const FINT *lda);
 void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
-                const double alpha, const double *X, const int incX,
-                const double *Y, const int incY, double *A, const int lda)
+		const double alpha, const double *X, const int incX,
+		const double *Y, const int incY, double *A, const int lda)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
-//swap m-n, y-x
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
+	//swap m-n, y-x
 #ifdef FORINT
-const FINT mtmp=M;
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-const FINT ldatmp=lda;
-FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp);
+	const FINT mtmp=M;
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	const FINT ldatmp=lda;
+	FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp);
 #else
-FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda);
+	FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda);
 #endif
 }

 void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda)
+		const void *alpha, const void *X, const int incX,
+		const void *Y, const int incY, void *A, const int lda)
 {
-LA::laerror("cblas_zgerc cannot be simply converted to fortran order");
+	laerror("cblas_zgerc cannot be simply converted to fortran order");
 }

 void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
-                 const void *alpha, const void *X, const int incX,
-                 const void *Y, const int incY, void *A, const int lda)
+		const void *alpha, const void *X, const int incX,
+		const void *Y, const int incY, void *A, const int lda)
 {
-LA::laerror("cblas_zgeru cannot be simply converted to fortran order");
+	laerror("cblas_zgeru cannot be simply converted to fortran order");
 }

 extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const double *alpha, const double *a, const FINT *lda, const double *b, const FINT *ldb, const double *beta, double *c, const FINT *ldc);
 void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const double alpha, const double *A,
-                 const int lda, const double *B, const int ldb,
-                 const double beta, double *C, const int ldc)
+		const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+		const int K, const double alpha, const double *A,
+		const int lda, const double *B, const int ldb,
+		const double beta, double *C, const int ldc)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
-//swap a-b, m-n
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
+	//swap a-b, m-n
 #ifdef FORINT
-const FINT mtmp=M;
-const FINT ntmp=N;
-const FINT ktmp=K;
-const FINT ldatmp=lda;
-const FINT ldbtmp=ldb;
-const FINT ldctmp=ldc;
-FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
-	 &ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
+	const FINT mtmp=M;
+	const FINT ntmp=N;
+	const FINT ktmp=K;
+	const FINT ldatmp=lda;
+	const FINT ldbtmp=ldb;
+	const FINT ldctmp=ldc;
+	FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
+			&ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
 #else
-FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
-	 &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
+	FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
+			&N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
 #endif
 }

 extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const void *alpha, const void *a, const FINT *lda, const void *b, const FINT *ldb, const void *beta, void *c, const FINT *ldc);
 void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
-                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
-                 const int K, const void *alpha, const void *A,
-                 const int lda, const void *B, const int ldb,
-                 const void *beta, void *C, const int ldc)
+		const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
+		const int K, const void *alpha, const void *A,
+		const int lda, const void *B, const int ldb,
+		const void *beta, void *C, const int ldc)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
-//swap a-b, m-n
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
+	//swap a-b, m-n
 #ifdef FORINT
-const FINT mtmp=M;
-const FINT ntmp=N;
-const FINT ktmp=K;
-const FINT ldatmp=lda;
-const FINT ldbtmp=ldb;
-const FINT ldctmp=ldc;
-FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
-		 TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
-         &ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
+	const FINT mtmp=M;
+	const FINT ntmp=N;
+	const FINT ktmp=K;
+	const FINT ldatmp=lda;
+	const FINT ldbtmp=ldb;
+	const FINT ldctmp=ldc;
+	FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
+			TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
+			&ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
 #else
-FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
-		 TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
-         &N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
+	FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
+			TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
+			&N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
 #endif
 }


 extern "C" void FORNAME(dgemv)  (const char *TRANS, const FINT *M, const FINT *N, const double *ALPHA, const double *A,  const FINT *LDA,  const double *X,  const FINT *INCX, const double *BETA, double *Y, const FINT *INCY);
 void cblas_dgemv(const enum CBLAS_ORDER Order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const double alpha, const double *A, const int lda,
-                 const double *X, const int incX, const double beta,
-                 double *Y, const int incY)
+		const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+		const double alpha, const double *A, const int lda,
+		const double *X, const int incX, const double beta,
+		double *Y, const int incY)
 {
 #ifdef FORINT
-const FINT mtmp=M;
-const FINT ntmp=N;
-const FINT ldatmp=lda;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
-//swap n-m and toggle transposition
-else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
+	const FINT mtmp=M;
+	const FINT ntmp=N;
+	const FINT ldatmp=lda;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
+	//swap n-m and toggle transposition
+	else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
 #else
-if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
-//swap n-m and toggle transposition
-else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
+	if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
+	//swap n-m and toggle transposition
+	else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
 #endif
 }


 extern "C" void FORNAME(zgemv)  (const char *TRANS, const FINT *M, const FINT *N, const void *ALPHA, const void *A,  const FINT *LDA,  const void *X,  const FINT *INCX, const void *BETA, void *Y, const FINT *INCY);
 void cblas_zgemv(const enum CBLAS_ORDER Order,
-                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
-                 const void *alpha, const void *A, const int lda,
-                 const void *X, const int incX, const void *beta,
-                 void *Y, const int incY)
+		const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
+		const void *alpha, const void *A, const int lda,
+		const void *X, const int incX, const void *beta,
+		void *Y, const int incY)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
-if(TransA == CblasConjTrans) LA::laerror("zgemv with CblasConjTrans not supportted");
-//swap n-m and toggle transposition
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
+	if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted");
+	//swap n-m and toggle transposition
 #ifdef FORINT
-const FINT mtmp=M;
-const FINT ntmp=N;
-const FINT ldatmp=lda;
-const FINT incxtmp=incX;
-const FINT incytmp=incY;
-FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
+	const FINT mtmp=M;
+	const FINT ntmp=N;
+	const FINT ldatmp=lda;
+	const FINT incxtmp=incX;
+	const FINT incytmp=incY;
+	FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
 #else
-FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
+	FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
 #endif
 }

 extern "C" FINT FORNAME(idamax) (const FINT *N, const double *DX, const FINT *INCX);
-
-CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX)
-{
+CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX) {
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT incxtmp=incX;
-return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp);
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp);
 #else
-return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX);
+	return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX);
 #endif
 }

+extern "C" FINT FORNAME(izamax) (const FINT *N, const void *DX, const FINT *INCX);
+CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX) {
+#ifdef FORINT
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	return (CBLAS_INDEX)FORNAME(izamax)(&ntmp, X, &incxtmp);
+#else
+	return (CBLAS_INDEX)FORNAME(izamax)(&N, X, &incX);
+#endif
+}

+/*
+extern "C" FINT FORNAME(idamin) (const FINT *N, const double *DX, const FINT *INCX);
+CBLAS_INDEX cblas_idamin(const int N, const double *X, const int incX) {
+#ifdef FORINT
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	return (CBLAS_INDEX)FORNAME(idamin)(&ntmp,X,&incxtmp);
+#else
+	return (CBLAS_INDEX)FORNAME(idamin)(&N,X,&incX);
+#endif
+}
+
+extern "C" FINT FORNAME(izamin) (const FINT *N, const void *DX, const FINT *INCX);
+CBLAS_INDEX cblas_izamin(const int N, const void *X, const int incX) {
+#ifdef FORINT
+	const FINT ntmp=N;
+	const FINT incxtmp=incX;
+	return (CBLAS_INDEX)FORNAME(izamin)(&ntmp, X, &incxtmp);
+#else
+	return (CBLAS_INDEX)FORNAME(izamin)(&N, X, &incX);
+#endif
+}
+*/
 #endif

-#ifdef  NONCLAPACK
+
+#ifdef NONCLAPACK
 //clapack_dgesv
 //allocate auxiliary storage and transpose input and output quantities to fortran/C order
 extern "C" void FORNAME(dgesv) (const FINT *N, const FINT *NRHS, double *A, const FINT *LDA, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO);

 int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
-                  double *A, const int lda, int *ipiv,
-                  double *B, const int ldb)
+		double *A, const int lda, int *ipiv,
+		double *B, const int ldb)
 {
-FINT INFO=0;
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
-//B should be in the same physical order, just transpose A in place and the LU result on output
-for(int i=1; i<N; ++i) for(int j=0; j<i; ++j)  {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
+	FINT INFO=0;
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
+	//B should be in the same physical order, just transpose A in place and the LU result on output
+	for(int i=1; i<N; ++i) for(int j=0; j<i; ++j)  {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
 #ifdef FORINT
-const FINT ntmp=N;
-const FINT nrhstmp=NRHS;
-const FINT ldatmp=lda;
-const FINT ldbtmp=ldb;
-FINT ipivtmp=*ipiv;
-FORNAME(dgesv) (&ntmp,&nrhstmp,A,&ldatmp,&ipivtmp,B,&ldbtmp,&INFO);
+	const FINT ntmp=N;
+	const FINT nrhstmp=NRHS;
+	const FINT ldatmp=lda;
+	const FINT ldbtmp=ldb;
+	FINT ipivtmp=*ipiv;
+	FORNAME(dgesv) (&ntmp,&nrhstmp,A,&ldatmp,&ipivtmp,B,&ldbtmp,&INFO);
 #else
-FORNAME(dgesv) (&N,&NRHS,A,&lda,ipiv,B,&ldb,&INFO);
+	FORNAME(dgesv) (&N,&NRHS,A,&lda,ipiv,B,&ldb,&INFO);
 #endif
-for(int i=1; i<N; ++i) for(int j=0; j<i; ++j)  {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
-return (int)INFO;
+	for(int i=1; i<N; ++i) for(int j=0; j<i; ++j)  {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
+	return (int)INFO;
 }

 #endif

-void cblas_dswap(const int N, double *X, const int incX,
-                 double *Y, const int incY){
-  LA::laerror("cblas_dswap is not available... (-DNONCBLAS)");
+
+extern "C" void FORNAME(dswap)(const FINT *N, double *X, const FINT *incX, double *Y, const FINT *incY);
+void cblas_dswap(const int N, double *X, const int incX, double *Y, const int incY){
+#ifdef FORINT
+	const FINT N_tmp = N;
+	const FINT incX_tmp = incX;
+	const FINT incY_tmp = incY;
+	FORNAME(dswap)(&N_tmp, X, &incX_tmp, Y, &incY_tmp);
+#else
+	FORNAME(dswap)(&N, X, &incX, Y, &incY);
+#endif
 }

-void cblas_zswap(const int N, void *X, const int incX,
-                 void *Y, const int incY){
-  LA::laerror("cblas_zswap is not available... (-DNONCBLAS)");
+extern "C" void FORNAME(zswap)(const FINT *N, void *X, const FINT *incX, void *Y, const FINT *incY);
+void cblas_zswap(const int N, void *X, const int incX, void *Y, const int incY){
+#ifdef FORINT
+	const FINT N_tmp = N;
+	const FINT incX_tmp = incX;
+	const FINT incY_tmp = incY;
+	FORNAME(zswap)(&N_tmp, X, &incX_tmp, Y, &incY_tmp);
+#else
+	FORNAME(zswap)(&N, X, &incX, Y, &incY);
+#endif
 }
--- a/noncblas.h
+++ b/noncblas.h
@@ -85,6 +85,11 @@ CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
 CBLAS_INDEX cblas_icamax(const int N, const void   *X, const int incX);
 CBLAS_INDEX cblas_izamax(const int N, const void   *X, const int incX);

+/*
+CBLAS_INDEX cblas_idamin(const int N, const double *X, const int incX);
+CBLAS_INDEX cblas_izamin(const int N, const void   *X, const int incX);
+*/
+
 /*
 * ===========================================================================
 * Prototypes for level 1 BLAS routines
--- a/smat.cc
+++ b/smat.cc
@@ -1,3 +1,6 @@
+//------------------------------------------------------------------------------
+/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
+//------------------------------------------------------------------------------
 /*
    LA: linear algebra C++ interface library
    Copyright (C) 2008 Jiri Pittner <jiri.pittner@jh-inst.cas.cz> or <jiri@pittnerovi.com>
@@ -25,219 +28,359 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <errno.h>
+
 extern "C" {
-extern ssize_t read(int, void *, size_t);
-extern ssize_t write(int, const void *, size_t);
+	extern ssize_t read(int, void *, size_t);
+	extern ssize_t write(int, const void *, size_t);
 }
-// TODO
-// specialize unary minus

 namespace LA {

-
-/*
- *  * Templates first, specializations for BLAS next
- *
- */
-
-//raw I/O
+/***************************************************************************//**
+ * routine for raw output 
+ * @param[in] fd file descriptor for output
+ * @param[in] dim number of elements intended for output
+ * @param[in] transp reserved
+ * @see NRMat<T>::get(), NRSMat<T>::copyonwrite()
+ ******************************************************************************/
 template <typename T>
-void NRSMat<T>::put(int fd, bool dim, bool transp) const
-{
+void NRSMat<T>::put(int fd, bool dim, bool transp) const {
 #ifdef CUDALA
-if(location!=cpu)
-        {
-        NRSMat<T> tmp= *this;
-        tmp.moveto(cpu);
-        tmp.put(fd,dim,transp);
-        return;
-        }
+	if(location != cpu){
+		NRSMat<T> tmp= *this;
+		tmp.moveto(cpu);
+		tmp.put(fd,dim,transp);
+		return;
+	}
+#endif
+	errno = 0;
+	if(dim){
+		if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
+		if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
+	}
+	LA_traits<T>::multiput(NN2,fd,v,dim);
+}
+
+/***************************************************************************//**
+ * routine for raw input 
+ * @param[in] fd file descriptor for input
+ * @param[in] dim number of elements intended for input
+ * @param[in] transp reserved
+ * @see NRSMat<T>::put(), NRSMat<T>::copyonwrite()
+ ******************************************************************************/
+template <typename T>
+void NRSMat<T>::get(int fd, bool dim, bool transp) {
+#ifdef CUDALA
+	if(location != cpu){
+		NRSMat<T> tmp;
+		tmp.moveto(cpu);
+		tmp.get(fd,dim,transp);
+		tmp.moveto(location);
+		*this = tmp;
+		return;
+	}
 #endif

-errno=0;
-if(dim)
-{
-if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
-if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
-}
-LA_traits<T>::multiput(NN2,fd,v,dim);
+	int nn0[2]; //align at least 8-byte
+	errno = 0;
+	if(dim){
+		if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read");
+		resize(nn0[0]);
+	}else{
+		copyonwrite();
+	}
+	LA_traits<T>::multiget(NN2,fd,v,dim);
 }

+
+/***************************************************************************//**
+ * constructor symmetrizing given matrix \f$A\f$ of general type <code>T</code> yielding \f$(A+A^\mathrm{T})/2\f$
+ * @param[in] rhs matrix \f$A\f$
+ ******************************************************************************/
 template <typename T>
-void NRSMat<T>::get(int fd, bool dim, bool transp)
-{
-#ifdef CUDALA
-if(location!=cpu)
-        {
-        NRSMat<T> tmp;
-        tmp.moveto(cpu);
-        tmp.get(fd,dim,transp);
-        tmp.moveto(location);
-        *this = tmp;
-        return;
-        }
-#endif
+NRSMat<T>::NRSMat(const NRMat<T> &rhs) {
+	NOT_GPU(rhs);

-int nn0[2]; //align at least 8-byte
-errno=0;
-if(dim)
-{
-if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read");
-resize(nn0[0]);
-}
-else 
-copyonwrite();
-LA_traits<T>::multiget(NN2,fd,v,dim);
-}
-
-
-// conversion ctor, symmetrize general Mat into SMat
-template <typename T>
-NRSMat<T>::NRSMat(const NRMat<T> &rhs)
-{
-nn=rhs.nrows();
+	nn = rhs.nrows();
 #ifdef DEBUG
-	if (nn != rhs.ncols()) laerror("attempt to convert non-square Mat to SMat");
+	if(nn != rhs.ncols()) laerror("attempt to convert nonsquare NRMat<T> to NRSMat<T>");
+#endif
+#ifdef CUDALA
+	location = rhs.getlocation();
 #endif
 	count = new int;
 	*count = 1;
 	v = new T[NN2];
-	int i, j, k=0;
-	for (i=0; i<nn; i++)
-		for (j=0; j<=i;j++) v[k++] = (rhs[i][j] + rhs[j][i])/((T)2);
+	int i, j, k(0);
+	for(i=0; i<nn; i++){
+		for(j=0; j<=i; j++){
+			v[k++] = (rhs[i][j] + rhs[j][i])/((T)2);
+		}
+	}
 }

-
-
-// assign to diagonal
+/***************************************************************************//**
+ * zero out this symmetric matrix of general type <code>T</code> and then set
+ * the diagonal elements to prescribed value
+ * @param[in] a scalar value to be assigned to the diagonal
+ * @return reference to the modified matrix
+ ******************************************************************************/
 template <typename T>
-NRSMat<T> & NRSMat<T>::operator=(const T &a)
-{
+NRSMat<T> & NRSMat<T>::operator=(const T &a) {
+	NOT_GPU(*this);
 	copyonwrite();
-	memset(v,0,NN2*sizeof(T));
-	for (int i=0; i<nn; i++) v[i*(i+1)/2+i] = a;
+	memset(v, 0, NN2*sizeof(T));
+	for(register int i=0; i<nn; i++) v[i*(i+1)/2 + i] = a;
 	return *this;
 }

-//get diagonal
+/***************************************************************************//**
+ * get or divide by the diagonal of real symmetric double-precision matrix
+ * @param[in, out] r vector for storing the diagonal
+ * @param[in] divide
+ * 	\li \c false save the diagonal to vector r
+ *	\li \c true divide the vector r by the diagonal elements element-wise
+ * @param[in] cache reserved
+ * @return
+ * 	\li <tt>divide == true</tt> NULL
+ *	\li <tt>divide == false</tt> pointer to the first element of r
+ ******************************************************************************/
 template <typename T>
-const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const
-{
+const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const {
 #ifdef DEBUG
-if(r.size()!=nn) laerror("incompatible vector in diagonalof()");
+	if(r.size() != nn) laerror("incompatible vector in const T* NRSMat<T>::diagonalof(NRVec<T> &, const bool, bool)");
 #endif
+	NOT_GPU(*this);
+	SAME_LOC(*this, r);

-r.copyonwrite();
+	r.copyonwrite();

-if (divide)
-	for (int i=0; i<nn; i++) {T a =v[i*(i+1)/2+i]; if(a!=0.) r[i] /= a;}
-else
-        for (int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
-return divide?NULL:&r[0];
+	if(divide){
+		for(register int i=0; i<nn; i++){
+			const T a = v[i*(i+1)/2+i];
+			if(a != 0.) r[i] /= a;
+		}
+	}else{
+	        for(register int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
+	}
+
+	return divide?NULL:&r[0];
 }


-// unary minus
+/***************************************************************************//**
+ * implements unary minus operator for this symmetric
+ * matrix of general type <code>T</code>
+ * @return modified copy of this matrix
+ ******************************************************************************/
 template <typename T>
-const NRSMat<T> NRSMat<T>::operator-() const
-{
-	NRSMat<T> result(nn);
-	for(int i=0; i<NN2; i++) result.v[i]= -v[i];
+const NRSMat<T> NRSMat<T>::operator-() const {
+	NOT_GPU(*this);
+
+	NRSMat<T> result(nn, getlocation());
+	for(register int i = 0; i<NN2; i++) result.v[i]= -v[i];
 	return result;
 }

-// trace of Smat
+/***************************************************************************//**
+ * implements unary minus operator for this real symmetric matrix
+ * @return modified copy of this matrix
+ ******************************************************************************/
+template <>
+const NRSMat<double> NRSMat<double>::operator-() const {
+	NRSMat<double> result(nn, getlocation());
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		memcpy(result.v, v, NN2*sizeof(double));
+		cblas_dscal(NN2, -1., result.v, 1);
+#ifdef CUDALA
+	}else{
+		cublasDcopy(NN2, v, 1, result.v, 1);
+		TEST_CUBLAS("cublasDcopy");
+
+		cublasDscal(NN2, -1., result.v, 1);
+		TEST_CUBLAS("cublasDscal");
+	}
+#endif
+	return result;
+}
+
+/***************************************************************************//**
+ * implements unary minus operator for this hermitian matrix
+ * @return modified copy of this matrix
+ ******************************************************************************/
+template <>
+const NRSMat<complex<double> > NRSMat<complex<double> >::operator-() const {
+	NRSMat<complex<double> > result(nn, getlocation());
+#ifdef CUDALA
+        if(location == cpu) {
+#endif
+		memcpy(result.v, v, NN2*sizeof(complex<double>));
+		cblas_zscal(NN2, &CMONE, result.v, 1);
+
+#ifdef CUDALA
+        }else{
+		cublasZcopy(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)result.v, 1);
+		TEST_CUBLAS("cublasZcopy");
+
+		cublasZscal(NN2, CUMONE, (cuDoubleComplex*)result.v, 1);
+		TEST_CUBLAS("cublasZscal");
+	}
+#endif
+	return result;
+}
+
+/***************************************************************************//**
+ * @return the sum of the diagonal elements
+ ******************************************************************************/
 template <typename T>
-const T NRSMat<T>::trace() const
-{
+const T NRSMat<T>::trace() const {
+	NOT_GPU(*this);
+
 	T tmp = 0;
-	for (int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
+	for(register int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
 	return tmp;
 }

+/***************************************************************************//**
+ * fill this real symmetric matrix with
+ * pseudorandom numbers generated from uniform distribution
+ ******************************************************************************/
 template<>
-void NRSMat<double>::randomize(const double &x)
-{
-for(int i=0; i<NN2; ++i) v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
+void NRSMat<double>::randomize(const double &x) {
+	NOT_GPU(*this);
+
+	for(int i=0; i<NN2; ++i){
+		v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
+	}
 }

+/***************************************************************************//**
+ * Fill this hermitian matrix with pseudorandom numbers generated from uniform 
+ * distribution. The real and imaginary parts are generated independently.
+ ******************************************************************************/
 template<>
-void NRSMat<complex<double> >::randomize(const double &x)
-{
-for(int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1.+RAND_MAX) -1.);
-for(int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1.+RAND_MAX) -1.);
-for(int i=0; i<nn; ++i) for(int j=0; j<=i; ++j) if(i==j) v[i*(i+1)/2+j].imag()=0; //hermitean 
+void NRSMat<complex<double> >::randomize(const double &x) {
+	for(register int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
+	for(register int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
+	for(register int i=0; i<nn; ++i){
+		for(register int j=0; j<=i; ++j){
+		       	if(i == j) v[i*(i+1)/2+j].imag() = 0; //hermitean
+		}
+	}
 }

-
-
-// write matrix to the file with specific format
+/***************************************************************************//**
+ * routine for formatted output via lawritemat
+ * @param[in] file pointer to <tt>FILE</tt> structure representing the output file
+ * @param[in] format format specification in standard printf-like form
+ * @param[in] modulo
+ * @see lawritemat()
+ ******************************************************************************/
 template <typename T>
-void NRSMat<T>::fprintf(FILE *file, const char *format, const int modulo) const
-{
+void NRSMat<T>::fprintf(FILE *file, const char *format, const int modulo) const {
+	NOT_GPU(*this);
+
 	lawritemat(file, (const T *)(*this) ,nn, nn, format, 2, modulo, 1);
 }

-// read matrix from the file with specific format
+
+/***************************************************************************//**
+ * routine for formatted input via fscanf
+ * @param[in] f pointer to <tt>FILE</tt> structure representing the input file
+ * @param[in] format format specification in standard printf-like form
+ ******************************************************************************/
 template <typename T>
-void NRSMat<T>::fscanf(FILE *f, const char *format)
-{
+void NRSMat<T>::fscanf(FILE *f, const char *format) {
 	int n, m;
-	if (::fscanf(f,"%d %d",&n,&m) != 2)
-		laerror("cannot read matrix dimensions in SMat::fscanf");
-	if (n != m) laerror("different dimensions of SMat");
+	NOT_GPU(*this);
+
+	if (::fscanf(f,"%d %d", &n, &m) != 2)
+		laerror("cannot read matrix dimensions in NRSMat<T>::fscanf(FILE *, const char *)");
+	if (n != m) laerror("different dimensions in NRSMat<T>::fscanf(FILE *, const char *)");
 	resize(n);
 	for (int i=0; i<n; i++) 
 		for (int j=0; j<n; j++)
 			if (::fscanf(f,format,&((*this)(i,j))) != 1)
-				laerror("Smat - cannot read matrix element");
+				laerror("NRSMat<T>::fscanf(FILE *, const char *) - unable to read matrix element");
 }


-/*
- * BLAS specializations for double and complex<double>
- */
-
-
-
-// SMat * Mat
-//NOTE: dsymm is not appropriate as it works on UNPACKED symmetric matrix
+/***************************************************************************//**
+ * multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
+ * with real double-precision dense matrix \f$A\f$
+ * @param[in] rhs real double-precision matrix \f$A\f$
+ * @return matrix produt \f$S\times{}A\f$
+ ******************************************************************************/
 template<>
-const NRMat<double> NRSMat<double>::operator*(const NRMat<double> &rhs) const
-{
+const NRMat<double> NRSMat<double>::operator*(const NRMat<double> &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat");
+	if(nn != rhs.nrows()) laerror("incompatible dimensions in NRMat<double> NRSMat<double>::operator*(const NRMat<double> &)");
+#endif
+	SAME_LOC(*this, rhs);
+	NRMat<double> result(nn, rhs.ncols(), getlocation());
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		for(register int k = 0; k<rhs.ncols(); k++){
+			cblas_dspmv(CblasRowMajor, CblasLower, nn, 1.0, v, rhs[0] + k, rhs.ncols(), 0.0, result[0] + k, rhs.ncols());
+		}
+#ifdef CUDALA
+	}else{
+		for(register int k = 0; k<rhs.ncols(); k++){
+			cublasDspmv('U', nn, 1.0, v, rhs[0] + k, rhs.ncols(), 0.0, result[0] + k, rhs.ncols());
+			TEST_CUBLAS("cublasDspmv");
+		}
+	}
 #endif
-	NRMat<double> result(nn, rhs.ncols());
-	for (int k=0; k<rhs.ncols(); k++)
-		cblas_dspmv(CblasRowMajor, CblasLower, nn, 1.0, v, rhs[0]+k, rhs.ncols(),
-				0.0, result[0]+k, rhs.ncols());
 	return result;
 }


+/***************************************************************************//**
+ * multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
+ * with real double-precision dense matrix \f$A\f$
+ * @param[in] rhs real double-precision matrix \f$A\f$
+ * @return matrix produt \f$S\times{}A\f$
+ ******************************************************************************/
 template<>
-const NRMat< complex<double> >
-NRSMat< complex<double> >::operator*(const NRMat< complex<double> > &rhs) const
-{
+const NRMat<complex<double> >
+NRSMat<complex<double> >::operator*(const NRMat<complex<double> > &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat");
+	if (nn != rhs.nrows()) laerror("incompatible dimensions in NRSMat<complex<double> >::operator*(const NRMat<complex<double> > &)");
+#endif
+	SAME_LOC(*this, rhs);
+	NRMat<complex<double> > result(nn, rhs.ncols(), getlocation());
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		for(register int k=0; k<rhs.ncols(); k++){
+			cblas_zhpmv(CblasRowMajor, CblasLower, nn, &CONE, v, rhs[0]+k, rhs.ncols(), &CZERO, result[0]+k, rhs.ncols());
+		}
+#ifdef CUDALA
+	}else{
+		for(register int k = 0; k<rhs.ncols(); k++){
+			cublasZhpmv('U', nn, 
+					CUONE, (cuDoubleComplex*)v, (cuDoubleComplex*)(rhs[0] + k), rhs.ncols(),
+					CUZERO, (cuDoubleComplex*)(result[0] + k), rhs.ncols());
+
+			TEST_CUBLAS("cublasDspmv");
+		}
+	}
 #endif
-	NRMat< complex<double> > result(nn, rhs.ncols());
-	for (int k=0; k<rhs.ncols(); k++)
-		cblas_zhpmv(CblasRowMajor, CblasLower, nn, &CONE, v, rhs[0]+k, rhs.ncols(),
-				&CZERO, result[0]+k, rhs.ncols());
 	return result;
 }

-
-
-// SMat * SMat
+/***************************************************************************//**
+ * multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
+ * with real double-precision symmetric matrix \f$T\f$
+ * @return matrix produt \f$S\times{}T\f$ (not necessarily symmetric)
+ ******************************************************************************/
 template<>
-const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const
-{
+const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat");
+	if (nn != rhs.nn) laerror("incompatible dimensions in NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &)");
 #endif
 	NRMat<double> result(0.0, nn, nn);
 	double *p, *q;
@@ -283,156 +426,295 @@ const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const
 }


-
+/***************************************************************************//**
+ * multiply this complex double-precision symmetric matrix \f$G\f$ stored in packed form
+ * with complex double-precision symmetric matrix \f$H\f$
+ * @return matrix produt \f$G\times{}H\f$ (not necessarily symmetric)
+ ******************************************************************************/
 template<>
-const NRMat< complex<double> > 
-NRSMat< complex<double> >::operator*(const NRSMat< complex<double> > &rhs) const
-{
+const NRMat<complex<double> > 
+NRSMat<complex<double> >::operator*(const NRSMat<complex<double> > &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat");
+	if (nn != rhs.nn) laerror("incompatible dimensions in NRSMat<complex<double> >::operator*(const NRSMat<complex<double> > &)");
 #endif
-	NRMat< complex<double> > result(0.0, nn, nn);
-	NRMat< complex<double> > rhsmat(rhs);
+	SAME_LOC(*this, rhs);
+	NRMat<complex<double> > result(nn, nn, getlocation());
+	NRMat<complex<double> > rhsmat(rhs);
 	result = *this * rhsmat;
 	return result;
-//	laerror("complex SMat*Smat not implemented");
 }


-
-
-// S dot S
+/***************************************************************************//**
+ * compute inner product of this real symmetric matrix \f$A\f$ with given real symmetric matrix \f$B\f$
+ * i.e. determine the value of
+ * \f[\sum_{i,j}A_{i,j}B_{i,j}\f]
+ * @param[in] rhs matrix \f$B\f$
+ * @return computed inner product
+ ******************************************************************************/
 template<>
-const double NRSMat<double>::dot(const NRSMat<double> &rhs) const
-{
+const double NRSMat<double>::dot(const NRSMat<double> &rhs) const {
+	double ret(0.);
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("dot of incompatible SMat's");
+	if (nn != rhs.nn) laerror("incompatible dimensions in double NRSMat<double>::dot(const NRSMat<double> &)");
 #endif
-	return cblas_ddot(NN2, v, 1, rhs.v, 1);
+	SAME_LOC(*this, rhs);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		ret = cblas_ddot(NN2, v, 1, rhs.v, 1);
+#ifdef CUDALA
+	}else{
+		ret = cublasDdot(NN2, v, 1, rhs.v, 1);
+	}
+#endif
+	return ret;
 }


-
+/***************************************************************************//**
+ * compute inner product of this complex symmetric matrix \f$A\f$ with given complex symmetric matrix \f$B\f$
+ * i.e. determine the value of
+ * \f[\sum_{i,j}\overbar{A_{i,j}}B_{i,j}\f]
+ * @param[in] rhs matrix \f$B\f$
+ * @return computed inner product
+ ******************************************************************************/
 template<>
-const complex<double> 
-NRSMat< complex<double> >::dot(const NRSMat< complex<double> > &rhs) const
-{
+const complex<double> NRSMat<complex<double> >::dot(const NRSMat<complex<double> > &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("dot of incompatible SMat's");
+	if (nn != rhs.nn) laerror("incompatible dimensions in complex<double> NRSMat<complex<double> >::dot(const NRSMat<complex<double> > &)");
+#endif
+	complex<double> dot(0., 0.);
+	SAME_LOC(*this, rhs);
+
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
+#ifdef CUDALA
+	}else{
+		const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
+		dot = complex<double>(cuCreal(_dot), cuCimag(_dot));
+		TEST_CUBLAS("cublasZdotc");
+	}
 #endif
-	complex<double> dot;
-	cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
 	return dot;
 }


+/***************************************************************************//**
+ * compute inner product of this real double-precision symmetric matrix \f$S\f$ of order \f$n\f$
+ * with given real double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$
+ * @param[in] rhs real double-precision vector \f$\vec{v}\f$
+ * @return computed inner product
+ ******************************************************************************/
 template<>
-const double NRSMat<double>::dot(const NRVec<double> &rhs) const
-{
+const double NRSMat<double>::dot(const NRVec<double> &rhs) const {
+	double ret(0.0);
 #ifdef DEBUG
-	if (NN2 != rhs.nn) laerror("dot of incompatible SMat's");
+	if(NN2 != rhs.nn) laerror("incompatible dimensions in double NRSMat<double>::dot(const NRVec<double> &)");
+#endif
+	SAME_LOC(*this, rhs);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		ret = cblas_ddot(NN2, v, 1, rhs.v, 1);
+#ifdef CUDALA
+	}else{
+		ret = cublasDdot(NN2, v, 1, rhs.v, 1);
+		TEST_CUBLAS("cublasDdot");
+	}
 #endif
-	return cblas_ddot(NN2, v, 1, rhs.v, 1);
 }


-
+/***************************************************************************//**
+ * compute inner product of this complex double-precision hermitian matrix \f$H\f$ of order \f$n\f$
+ * with given complex double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$
+ * @param[in] rhs complex double-precision vector \f$\vec{v}\f$
+ * @return computed inner product
+ ******************************************************************************/
 template<>
 const complex<double> 
-NRSMat< complex<double> >::dot(const NRVec< complex<double> > &rhs) const
-{
+NRSMat<complex<double> >::dot(const NRVec<complex<double> > &rhs) const {
 #ifdef DEBUG
-	if (NN2 != rhs.nn) laerror("dot of incompatible SMat's");
+	if(NN2 != rhs.nn) laerror("incompatible dimensions in complex<double>  NRSMat<complex<double> >::dot(const NRVec<complex<double> > &)");
+#endif
+	complex<double> dot(0., 0.);
+	SAME_LOC(*this, rhs);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
+#ifdef CUDALA
+	}else{
+		const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)rhs.v, 1);
+		TEST_CUBLAS("cublasZdotc");
+		dot = complex<double>(cuCreal(_dot), cuCimag(_dot));
+	}
 #endif
-	complex<double> dot;
-	cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
 	return dot;
 }

-
-// norm of the matrix
+/***************************************************************************//**
+ * compute the Frobenius norm of this real double-precision symmetric  matrix
+ * @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation
+ ******************************************************************************/
 template<>
-const double  NRSMat<double>::norm(const double scalar) const
-{
-	if (!scalar) return cblas_dnrm2(NN2, v, 1);
-	double sum = 0;
-	int k = 0;
-	for (int i=0; i<nn; ++i)
-		for (int j=0; j<=i; ++j) {
-			register double tmp;
-			tmp = v[k++];
-			if (i == j) tmp -= scalar;
+const double NRSMat<double>::norm(const double scalar) const {
+	if(!scalar){
+		double ret(0.);
+#ifdef CUDALA
+		if(location == cpu){
+#endif
+			ret = cblas_dnrm2(NN2, v, 1);
+#ifdef CUDALA
+		}else{
+			ret = cublasDnrm2(NN2, v, 1);
+			TEST_CUBLAS("cublasDnrm2");
+		}
+#endif
+		return ret;
+	}
+
+	NOT_GPU(*this);
+
+	double sum(0.);
+	int k(0);
+	for(register int i=0; i<nn; ++i){
+		for(register int j=0; j<=i; ++j) {
+			register double tmp = v[k++];
+			if(i == j) tmp -= scalar;
 			sum += tmp*tmp;
 		}
+	}
 	return std::sqrt(sum);
 }

-
-
+/***************************************************************************//**
+ * compute the Frobenius norm of this complex double-precision hermitian  matrix
+ * @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation
+ ******************************************************************************/
 template<>
-const double NRSMat< complex<double> >::norm(const complex<double> scalar) const
-{
-	if (!(scalar.real()) && !(scalar.imag()))
-		return cblas_dznrm2(NN2, v, 1);
-	double sum = 0;
+const double NRSMat< complex<double> >::norm(const complex<double> scalar) const {
+	if(!(scalar.real()) && !(scalar.imag())){
+		double ret(0.);
+#ifdef CUDALA
+		if(location == cpu){
+#endif
+			ret = cblas_dznrm2(NN2, v, 1);
+#ifdef CUDALA
+		}else{
+			ret = cublasDznrm2(NN2, (cuDoubleComplex*)v, 1);
+			TEST_CUBLAS("cublasDznrm2");
+		}
+#endif
+		return ret;
+	}
+
+	int k(0);
+	double sum(0.);
 	complex<double> tmp;
-	int k = 0;
-	for (int i=0; i<nn; ++i)
-		for (int j=0; j<=i; ++j) {
+
+	for(register int i=0; i<nn; ++i){
+		for(register int j=0; j<=i; ++j){
 			tmp = v[k++];
 			if (i == j) tmp -= scalar;
 			sum += tmp.real()*tmp.real() + tmp.imag()*tmp.imag();
 		}
+	}
 	return std::sqrt(sum);
 }


-
-
-
-// axpy: S = S * a
+/***************************************************************************//**
+ * for this real double-precision symmetric matrix \f$S\f$ stored in packed form,
+ * real scalar value \f$\alpha\f$ and real double-precision symmetric matrix \f$T\f$, compute
+ * \f[S \leftarrow \alpha T + S\f]
+ ******************************************************************************/
 template<>
-void NRSMat<double>::axpy(const double alpha, const NRSMat<double> & x)
-{
+void NRSMat<double>::axpy(const double alpha, const NRSMat<double> &x) {
 #ifdef DEBUG
-	if (nn != x.nn) laerror("axpy of incompatible SMats");
+	if(nn != x.nn) laerror("incompatible dimensions in void NRSMat<double>::axpy(const double, const NRSMat<double>&)");
 #endif
+	SAME_LOC(*this, x);
 	copyonwrite();
-	cblas_daxpy(NN2, alpha, x.v, 1, v, 1);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_daxpy(NN2, alpha, x.v, 1, v, 1);
+#ifdef CUDALA
+	}else{
+		cublasDaxpy(NN2, alpha, x.v, 1, v, 1);
+		TEST_CUBLAS("cublasDaxpy");
+	}
+#endif
 }


-
+/***************************************************************************//**
+ * for this complex double-precision hermitian matrix \f$H\f$ stored in packed form,
+ * complex scalar value \f$\alpha\f$ and complex double-precision hermitian matrix \f$G\f$, compute
+ * \f[H \leftarrow \alpha G + H\f]
+ ******************************************************************************/
 template<>
-void NRSMat< complex<double> >::axpy(const complex<double> alpha,
-			const NRSMat< complex<double> > & x)
-{
+void NRSMat<complex<double> >::axpy(const complex<double> alpha, const NRSMat<complex<double> > & x) {
 #ifdef DEBUG
-	if (nn != x.nn) laerror("axpy of incompatible SMats");
+	if(nn != x.nn) laerror("incompatible dimensions in void NRSMat<complex<double> >::axpy(const complex<double> , const NRSMat<complex<double> >&)");
 #endif
+	SAME_LOC(*this, x);
 	copyonwrite();
-	cblas_zaxpy(nn, &alpha, x.v, 1, v, 1);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_zaxpy(nn, &alpha, x.v, 1, v, 1);
+#ifdef CUDALA
+	}else{
+		const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
+		cublasZaxpy(NN2, _alpha, (cuDoubleComplex*)x.v, 1, (cuDoubleComplex*)v, 1);
+		TEST_CUBLAS("cublasZaxpy");
+	}
+#endif
+	
 }

-//complex from real
+/***************************************************************************//**
+ * create hermitian matrix \f$H\f$ from given real double-precision symmetric
+ * matrix \f$S\f$
+ * @param[in] rhs real double-precision symmetric matrix \f$S\f$
+ * @param[in] imagpart flag determining whether \f$S\f$ should correspond to the real or imaginary part of \f$H\f$
+ ******************************************************************************/
 template<>
-NRSMat<complex<double> >::NRSMat(const NRSMat<double> &rhs, bool imagpart)
-: nn(rhs.nrows()),  v(new complex<double>[rhs.nrows()*(rhs.nrows()+1)/2]), count(new int(1))
-{
-memset(v,0,nn*(nn+1)/2*sizeof(complex<double>));
-cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2);
+NRSMat<complex<double> >::NRSMat(const NRSMat<double> &rhs, bool imagpart): nn(rhs.nrows()), count(new int(1)) {
+	//inconsistent in general case?
+	const int nnp1 = nn*(nn + 1)/2;
+#ifdef CUDALA
+	location = rhs.getlocation();
+	if(location == cpu){
+#endif
+		v = new complex<double>[nnp1];
+		memset(v, 0, nnp1*sizeof(complex<double>));
+		cblas_dcopy(nnp1, &rhs(0, 0), 1, ((double *)v) + (imagpart?1:0), 2);
+#ifdef CUDALA
+	}else{
+		v = (complex<double>*) gpualloc(nnp1*sizeof(complex<double>));
+
+		complex<double> *_val = gpuputcomplex(CZERO);
+		cublasZcopy(nnp1, (cuDoubleComplex*)_val, 0, (cuDoubleComplex*)v, 1);
+		TEST_CUBLAS("cublasZcopy");
+		gpufree(_val);
+
+		cublasDcopy(nnp1, (double*)(&rhs(0,0)), 1, ((double*)v) + (imagpart?1:0), 2);
+		TEST_CUBLAS("cublasDcopy"); 
+	}
+#endif
 }

-
-//some template specializations leading to BLAS/CUBLAS calls
-
-
-
-
-//////////////////////////////////////////////////////////////////////////////
-////// forced instantization in the corresponding object file
+/***************************************************************************//**
+ * forced instantization in the corresponding object file
+ ******************************************************************************/
 template class NRSMat<double>;
-template class NRSMat< complex<double> >;
+template class NRSMat<complex<double> >;

 template class NRSMat<long long>;
 template class NRSMat<long>;
--- a/smat.h
+++ b/smat.h
--- a/vec.cc
+++ b/vec.cc
--- a/vec.h
+++ b/vec.h