cuda

2010-09-08 16:27:58 +00:00
parent e8cbf9e5fb
commit e580467e5a
14 changed files with 7106 additions and 3691 deletions
--- a/cuda_la.cc
+++ b/cuda_la.cc
@@ -1,3 +1,6 @@
 /* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
 /*******************************************************************************
 *******************************************************************************/
 #include "la_traits.h"
 #include "cuda_la.h"
@@ -7,64 +10,55 @@ namespace LA {
 GPUID DEFAULT_LOC = cpu;
-void set_default_loc(const GPUID loc) 
+void set_default_loc(const GPUID loc){
 {
 	DEFAULT_LOC = loc;
 }
-void *gpualloc(size_t size)
+void *gpualloc(size_t size){
 {
 cublasStatus status;
 	void *ptr = NULL;
-status = cublasAlloc(size,1,&ptr);
+	cublasAlloc(size, 1, &ptr);
-if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
+	TEST_CUBLAS("cublasAlloc");
 	return ptr;
 }
-void gpufree(void *ptr)
+void gpufree(void *ptr){
-{
+	cublasFree(ptr);
-cublasStatus status = cublasFree(ptr);
+	TEST_CUBLAS("cublasFree");
 if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasFree");
 }
-void gpuget(size_t n, size_t elsize, const void *from, void *to)
+void gpuget(size_t n, size_t elsize, const void *from, void *to){
-{
+	cublasGetVector(n, elsize, from, 1, to, 1);
-cublasStatus status;
+	TEST_CUBLAS("cublasGetVector");
 status=cublasGetVector(n,elsize,from,1,to,1);
 if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasGetVector");
 }
-void gpuput(size_t n, size_t elsize, const void *from, void *to)
+void gpuput(size_t n, size_t elsize, const void *from, void *to){
-{
+	cublasSetVector(n, elsize, from, 1, to, 1);
-cublasStatus status;
+	TEST_CUBLAS("cublasSetVector");
 status=cublasSetVector(n,elsize,from,1,to,1);
 if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
 }
-double *gpuputdouble(const double &x)
+double *gpuputdouble(const double &x){
 {
 cublasStatus status;
 	void *ptr = NULL;
-status = cublasAlloc(1,sizeof(double),&ptr);
+	cublasAlloc(1, sizeof(double), &ptr);
-if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
+	TEST_CUBLAS("cublasAlloc");
-status=cublasSetVector(1,sizeof(double),&x,1,ptr,1);
+
-if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
+	cublasSetVector(1, sizeof(double), &x, 1, ptr, 1);
 	TEST_CUBLAS("cublasSetVector");
 	return (double *)ptr;
 }
-complex<double> *gpuputcomplex(const complex<double> &x)
+complex<double> *gpuputcomplex(const complex<double> &x){
 {
 cublasStatus status;
 	void *ptr = NULL;
-status = cublasAlloc(1,sizeof(complex<double>),&ptr);
+	cublasAlloc(1, sizeof(complex<double>), &ptr);
-if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
+	TEST_CUBLAS("cublasAlloc");
-status=cublasSetVector(1,sizeof(complex<double>),&x,1,ptr,1);
+
-if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
+	cublasSetVector(1, sizeof(complex<double>), &x, 1, ptr, 1);
 	TEST_CUBLAS("cublasSetVector");
 	return (complex<double> *)ptr;
 }
 }
 #endif
--- a/cuda_la.h
+++ b/cuda_la.h
@@ -1,6 +1,10 @@
 //------------------------------------------------------------------------------
 /* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
 //------------------------------------------------------------------------------
 #ifndef _CUDA_LA_H
 #define _CUDA_LA_H
 #include <errno.h>
 #ifdef CUDALA
 #undef MATPTR
 #include "cublas.h"
@@ -13,6 +17,7 @@ namespace LA {
 #ifdef CUDALA
 #define CPU_GPU(x,y) {if((x)!=cpu && (y)!=cpu) laerror("one operand must be in CPU memory");}
 #define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");}
 #define NOT_CPU(x) {if((x).getlocation()==cpu) laerror("Operation not implemented on CPU (yet). Use .moveto(>0) first.");}
 #define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
 #define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
 #else
@@ -22,6 +27,16 @@ namespace LA {
 #define SAME_LOC3(x,y,z) {}
 #endif
 #ifdef DEBUG
 	#ifdef __GNUG__
 		#define TEST_CUBLAS(X) { if(cublasGetError() != CUBLAS_STATUS_SUCCESS){ laerror2(#X, __PRETTY_FUNCTION__); } }
 	#else
 		#define TEST_CUBLAS(X) { if(cublasGetError() != CUBLAS_STATUS_SUCCESS){ laerror2(#X, __func__); } }
 	#endif
 #else
 	#define TEST_CUBLAS(X) {}
 #endif
 typedef enum {undefined=-1, cpu=0, gpu1=1, gpu2=2, gpu3=3, gpu4=4} GPUID;
 #ifdef CUDALA
@@ -33,6 +48,7 @@ public:
 		{
 		cublasStatus status = cublasInit();
 		if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot init GPU for CUBLAS");
 		errno = 0;
 		}
 	~GPU_START(void)
 		{
@@ -50,8 +66,41 @@ extern complex<double> *gpuputcomplex(const complex<double> &x);
 void set_default_loc(const GPUID loc);
-extern GPUID DEFAULT_LOC;
+template <typename T>
 void smart_gpu_set(size_t n, const T& val, void *gpu_to, size_t _step = 1){
 	void *ptr(NULL);
 	if(sizeof(T)%sizeof(float) != 0){ laerror("memory alignment error"); }
 	cublasAlloc(1, sizeof(T), &ptr);
 	TEST_CUBLAS("cublasAlloc");
 	cublasSetVector(1, sizeof(T), &val, 1, ptr, 1);
 	TEST_CUBLAS("cublasSetVector");
 	if(sizeof(T) == sizeof(float)){
 		cublasScopy(n, (float*)ptr, 0, ((float*)gpu_to), _step);
 		TEST_CUBLAS("cublasScopy");
 	}else if(sizeof(T) == sizeof(double)){
 		cublasDcopy(n, (double*)ptr, 0, ((double*)gpu_to), _step);
 		TEST_CUBLAS("cublasDcopy");
 	}else if(sizeof(T) == sizeof(complex<double>)){
 		cublasZcopy(n, (cuDoubleComplex*)ptr, 0, (cuDoubleComplex*)gpu_to, _step);
 		TEST_CUBLAS("cublasZcopy");
 	}else{
 		for(register int i=0; i<sizeof(T)/sizeof(float); i++){
 			cublasScopy(n, (float*)ptr + i, 0, ((float*)gpu_to) + i, sizeof(T)/sizeof(float)*_step);
 			TEST_CUBLAS("cublasScopy");
 		}
 	}
 	cublasFree(ptr);
 	TEST_CUBLAS("cublasFree");
 }
 extern GPUID DEFAULT_LOC;
 #endif
 }
--- a/la.h
+++ b/la.h
@@ -44,5 +44,12 @@
 #include "sparsesmat.h"
 #include "vec.h"
-
+using namespace LA;
 typedef NRMat<int> NRIMat;
 typedef NRMat<double> NRDMat;
 typedef NRMat<complex<double> > NRCMat;
 typedef NRVec<int> NRIVec;
 typedef NRVec<double> NRDVec;
 typedef NRVec<complex<double> > NRCVec;
 #endif /* _LA_H_ */
--- a/la_traits.h
+++ b/la_traits.h
@@ -32,6 +32,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <iostream>
 #include <limits>
 #include <complex>
--- a/laerror.cc
+++ b/laerror.cc
@@ -42,14 +42,14 @@ bool _LA_count_check=true;
 extern "C" void _findme(void) {}; //for autoconf test we need a function with C linkage
-void laerror(const char *s1)
+void laerror2(const char *s1, const char *s2)
 {
  std::cerr << "LA:ERROR - ";
  std::cout << "LA:ERROR - ";
  if(s1)
  {
-    std::cerr << s1  << "\n";
+    std::cerr << s2 << ": " << s1 << "\n";
-    std::cout << s1  << "\n";
+    std::cout << s2 << ": " << s1 << "\n";
  }
 #ifdef CUDALA
 {
@@ -63,9 +63,9 @@ std::cout << "CUBLAS status = "<<s<<std::endl;
 throw LAerror(s1); 
 }
 //stub for f77 blas called from strassen routine
-extern "C" void xerbla_(const char name[6], int *n)
+extern "C" void xerbla_(const char name[6], int *n){
 {
 	char msg[1024];
 	strcpy(msg,"LAPACK or BLAS error in routine ");
 	strncat(msg,name,6);
@@ -75,9 +75,7 @@ laerror(msg);
 //with atlas-cblas another error routine is necessary
-
+extern "C" void ATL_xerbla(int p, char *rout, char *form, ...){
 extern "C" void ATL_xerbla(int p, char *rout, char *form, ...)
 {
 	char msg0[1024], *msg;
 	va_list argptr;
 	va_start(argptr, form);
@@ -89,8 +87,7 @@ extern "C" void ATL_xerbla(int p, char *rout, char *form, ...)
 	laerror(msg0);
 }
-int cblas_errprn(int ierr, int info, char *form, ...)
+int cblas_errprn(int ierr, int info, char *form, ...) {
 {
 	char msg0[1024], *msg;
 	va_list argptr;
 	va_start(argptr, form);
--- a/laerror.h
+++ b/laerror.h
@@ -29,10 +29,15 @@ class LAerror
 	LAerror(const char *s) {msg=s;};
 	};
-extern void laerror(const char *);
+#ifdef __GNUG__
 	#define laerror(X) { LA::laerror2(X, __PRETTY_FUNCTION__); }
 #else
 	#define laerror(X) { LA::laerror2(X, __func__); }
 #endif
-inline std::ostream & operator<<(std::ostream &s, const LAerror &x)
+extern void laerror2(const char *, const char *);
-{
+
 inline std::ostream & operator<<(std::ostream &s, const LAerror &x) {
 s << x.msg;
 return s;
 }
--- a/mat.cc
+++ b/mat.cc
--- a/mat.h
+++ b/mat.h
--- a/noncblas.cc
+++ b/noncblas.cc
@@ -1,4 +1,5 @@
-/*
+/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
 /*******************************************************************************
    LA: linear algebra C++ interface library
    Copyright (C) 2008 Jiri Pittner <jiri.pittner@jh-inst.cas.cz> or <jiri@pittnerovi.com>
@@ -14,8 +15,7 @@
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
+*******************************************************************************/
 #include "noncblas.h"
 #include "laerror.h"
@@ -27,9 +27,7 @@
 //Level 1 - straightforward wrappers
 extern "C" double FORNAME(ddot) (const FINT *n, const double *x, const FINT *incx, const double *y, const FINT *incy);
-double cblas_ddot(const int N, const double *X, const int incX,
+double cblas_ddot(const int N, const double *X, const int incX, const double *Y, const int incY){
                  const double *Y, const int incY)
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -41,8 +39,7 @@ return FORNAME(ddot)(&N,X,&incX,Y,&incY);
 }
 extern "C" void FORNAME(dscal) (const FINT *n, const double *a, double *x, const FINT *incx);
-void cblas_dscal(const int N, const double alpha, double *X, const int incX)
+void cblas_dscal(const int N, const double alpha, double *X, const int incX){
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -53,9 +50,7 @@ FORNAME(dscal) (&N,&alpha,X,&incX);
 }
 extern "C" void FORNAME(dcopy) (const FINT *n, const double *x, const FINT *incx, double *y, const FINT *incy);
-void cblas_dcopy(const int N, const double *X, const int incX,
+void cblas_dcopy(const int N, const double *X, const int incX, double *Y, const int incY){
                 double *Y, const int incY)
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -67,9 +62,7 @@ FORNAME(dcopy) (&N,X,&incX,Y,&incY);
 }
 extern "C" void FORNAME(daxpy) (const FINT *n, const double *a, const double *x, const FINT *incx, double *y, const FINT *incy);
-void cblas_daxpy(const int N, const double alpha, const double *X,
+void cblas_daxpy(const int N, const double alpha, const double *X, const int incX, double *Y, const int incY){
                 const int incX, double *Y, const int incY)
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -81,8 +74,7 @@ FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY);
 }
 extern "C" double FORNAME(dnrm2) (const FINT *n, const double *x, const FINT *incx);
-double cblas_dnrm2(const int N, const double *X, const int incX)
+double cblas_dnrm2(const int N, const double *X, const int incX){
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -93,8 +85,7 @@ return FORNAME(dnrm2) (&N,X,&incX);
 }
 extern "C" double FORNAME(dasum) (const FINT *n, const double *x, const FINT *incx);
-double cblas_dasum(const int N, const double *X, const int incX)
+double cblas_dasum(const int N, const double *X, const int incX){
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -105,9 +96,7 @@ return FORNAME(dasum) (&N,X,&incX);
 }
 extern "C" void FORNAME(zcopy) (const FINT *n, const void *x, const FINT *incx, void *y, const FINT *incy);
-void cblas_zcopy(const int N, const void *X, const int incX,
+void cblas_zcopy(const int N, const void *X, const int incX, void *Y, const int incY){
                 void *Y, const int incY)
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -119,9 +108,7 @@ FORNAME(zcopy) (&N,X,&incX,Y,&incY);
 }
 extern "C" void FORNAME(zaxpy) (const FINT *n, const void *a, const void *x, const FINT *incx, void *y, const FINT *incy);
-void cblas_zaxpy(const int N, const void *alpha, const void *X,
+void cblas_zaxpy(const int N, const void *alpha, const void *X, const int incX, void *Y, const int incY){
                 const int incX, void *Y, const int incY)
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -133,8 +120,7 @@ FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY);
 }
 extern "C" void FORNAME(zscal) (const FINT *n, const void *a, void *x, const FINT *incx);
-void cblas_zscal(const int N, const void *alpha, void *X, const int incX)
+void cblas_zscal(const int N, const void *alpha, void *X, const int incX){
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -145,8 +131,7 @@ FORNAME(zscal)(&N,alpha,X,&incX);
 }
 extern "C" void FORNAME(zdscal) (const FINT *n, const double *a, void *x, const FINT *incx);
-void cblas_zdscal(const int N, const double alpha, void *X, const int incX)
+void cblas_zdscal(const int N, const double alpha, void *X, const int incX){
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -158,8 +143,7 @@ FORNAME(zdscal)(&N,&alpha,X,&incX);
 extern "C" double FORNAME(dznrm2) (const FINT *n, const void *x, const FINT *incx);
-double cblas_dznrm2(const int N, const void *X, const int incX)
+double cblas_dznrm2(const int N, const void *X, const int incX){
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -171,12 +155,8 @@ return FORNAME(dznrm2) (&N,X,&incX);
 //the following ones are f2c-compatible, but is it truly portable???
 extern "C" void FORNAME(zdotu) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
-
+void cblas_zdotu_sub(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotu){
 void   cblas_zdotu_sub(const int N, const void *X, const int incX,
                       const void *Y, const int incY, void *dotu)
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -189,9 +169,7 @@ FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY);
 extern "C" void FORNAME(zdotc) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
-void   cblas_zdotc_sub(const int N, const void *X, const int incX,
+void cblas_zdotc_sub(const int N, const void *X, const int incX, const void *Y, const int incY, void *dotc){
                       const void *Y, const int incY, void *dotc)
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -215,8 +193,8 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
 		const double *X, const int incX,
 		const double beta, double *Y, const int incY)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
-if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
+	if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -234,8 +212,8 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
 		const void *X, const int incX,
 		const void *beta, void *Y, const int incY)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
-if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
+	if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -254,7 +232,7 @@ void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
 		const double alpha, const double *X, const int incX,
 		const double *Y, const int incY, double *A, const int lda)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
 	//swap m-n, y-x
 #ifdef FORINT
 	const FINT mtmp=M;
@@ -272,14 +250,14 @@ void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
 		const void *alpha, const void *X, const int incX,
 		const void *Y, const int incY, void *A, const int lda)
 {
-LA::laerror("cblas_zgerc cannot be simply converted to fortran order");
+	laerror("cblas_zgerc cannot be simply converted to fortran order");
 }
 void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
 		const void *alpha, const void *X, const int incX,
 		const void *Y, const int incY, void *A, const int lda)
 {
-LA::laerror("cblas_zgeru cannot be simply converted to fortran order");
+	laerror("cblas_zgeru cannot be simply converted to fortran order");
 }
 extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const double *alpha, const double *a, const FINT *lda, const double *b, const FINT *ldb, const double *beta, double *c, const FINT *ldc);
@@ -289,7 +267,7 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
 		const int lda, const double *B, const int ldb,
 		const double beta, double *C, const int ldc)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
 	//swap a-b, m-n
 #ifdef FORINT
 	const FINT mtmp=M;
@@ -313,7 +291,7 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
 		const int lda, const void *B, const int ldb,
 		const void *beta, void *C, const int ldc)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
 	//swap a-b, m-n
 #ifdef FORINT
 	const FINT mtmp=M;
@@ -364,8 +342,8 @@ void cblas_zgemv(const enum CBLAS_ORDER Order,
 		const void *X, const int incX, const void *beta,
 		void *Y, const int incY)
 {
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
-if(TransA == CblasConjTrans) LA::laerror("zgemv with CblasConjTrans not supportted");
+	if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted");
 	//swap n-m and toggle transposition
 #ifdef FORINT
 	const FINT mtmp=M;
@@ -380,9 +358,7 @@ FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX,
 }
 extern "C" FINT FORNAME(idamax) (const FINT *N, const double *DX, const FINT *INCX);
-
+CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX) {
 CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX)
 {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
@@ -392,8 +368,42 @@ return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX);
 #endif
 }
-
+extern "C" FINT FORNAME(izamax) (const FINT *N, const void *DX, const FINT *INCX);
 CBLAS_INDEX cblas_izamax(const int N, const void *X, const int incX) {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
 	return (CBLAS_INDEX)FORNAME(izamax)(&ntmp, X, &incxtmp);
 #else
 	return (CBLAS_INDEX)FORNAME(izamax)(&N, X, &incX);
 #endif
 }
 /*
 extern "C" FINT FORNAME(idamin) (const FINT *N, const double *DX, const FINT *INCX);
 CBLAS_INDEX cblas_idamin(const int N, const double *X, const int incX) {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
 	return (CBLAS_INDEX)FORNAME(idamin)(&ntmp,X,&incxtmp);
 #else
 	return (CBLAS_INDEX)FORNAME(idamin)(&N,X,&incX);
 #endif
 }
 extern "C" FINT FORNAME(izamin) (const FINT *N, const void *DX, const FINT *INCX);
 CBLAS_INDEX cblas_izamin(const int N, const void *X, const int incX) {
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
 	return (CBLAS_INDEX)FORNAME(izamin)(&ntmp, X, &incxtmp);
 #else
 	return (CBLAS_INDEX)FORNAME(izamin)(&N, X, &incX);
 #endif
 }
 */
 #endif
 #ifdef NONCLAPACK
 //clapack_dgesv
@@ -405,7 +415,7 @@ int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
 		double *B, const int ldb)
 {
 	FINT INFO=0;
-if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
+	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
 	//B should be in the same physical order, just transpose A in place and the LU result on output
 	for(int i=1; i<N; ++i) for(int j=0; j<i; ++j)  {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
 #ifdef FORINT
@@ -424,12 +434,27 @@ return (int)INFO;
 #endif
-void cblas_dswap(const int N, double *X, const int incX,
+
-                 double *Y, const int incY){
+extern "C" void FORNAME(dswap)(const FINT *N, double *X, const FINT *incX, double *Y, const FINT *incY);
-  LA::laerror("cblas_dswap is not available... (-DNONCBLAS)");
+void cblas_dswap(const int N, double *X, const int incX, double *Y, const int incY){
 #ifdef FORINT
 	const FINT N_tmp = N;
 	const FINT incX_tmp = incX;
 	const FINT incY_tmp = incY;
 	FORNAME(dswap)(&N_tmp, X, &incX_tmp, Y, &incY_tmp);
 #else
 	FORNAME(dswap)(&N, X, &incX, Y, &incY);
 #endif
 }
-void cblas_zswap(const int N, void *X, const int incX,
+extern "C" void FORNAME(zswap)(const FINT *N, void *X, const FINT *incX, void *Y, const FINT *incY);
-                 void *Y, const int incY){
+void cblas_zswap(const int N, void *X, const int incX, void *Y, const int incY){
-  LA::laerror("cblas_zswap is not available... (-DNONCBLAS)");
+#ifdef FORINT
 	const FINT N_tmp = N;
 	const FINT incX_tmp = incX;
 	const FINT incY_tmp = incY;
 	FORNAME(zswap)(&N_tmp, X, &incX_tmp, Y, &incY_tmp);
 #else
 	FORNAME(zswap)(&N, X, &incX, Y, &incY);
 #endif
 }
--- a/noncblas.h
+++ b/noncblas.h
@@ -85,6 +85,11 @@ CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX);
 CBLAS_INDEX cblas_icamax(const int N, const void   *X, const int incX);
 CBLAS_INDEX cblas_izamax(const int N, const void   *X, const int incX);
 /*
 CBLAS_INDEX cblas_idamin(const int N, const double *X, const int incX);
 CBLAS_INDEX cblas_izamin(const int N, const void   *X, const int incX);
 */
 /*
 * ===========================================================================
 * Prototypes for level 1 BLAS routines
--- a/smat.cc
+++ b/smat.cc
@@ -1,3 +1,6 @@
 //------------------------------------------------------------------------------
 /* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
 //------------------------------------------------------------------------------
 /*
    LA: linear algebra C++ interface library
    Copyright (C) 2008 Jiri Pittner <jiri.pittner@jh-inst.cas.cz> or <jiri@pittnerovi.com>
@@ -25,50 +28,50 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <errno.h>
 extern "C" {
 	extern ssize_t read(int, void *, size_t);
 	extern ssize_t write(int, const void *, size_t);
 }
 // TODO
 // specialize unary minus
 namespace LA {
-
+/***************************************************************************//**
-/*
+ * routine for raw output 
- *  * Templates first, specializations for BLAS next
+ * @param[in] fd file descriptor for output
- *
+ * @param[in] dim number of elements intended for output
- */
+ * @param[in] transp reserved
-
+ * @see NRMat<T>::get(), NRSMat<T>::copyonwrite()
-//raw I/O
+ ******************************************************************************/
 template <typename T>
-void NRSMat<T>::put(int fd, bool dim, bool transp) const
+void NRSMat<T>::put(int fd, bool dim, bool transp) const {
 {
 #ifdef CUDALA
-if(location!=cpu)
+	if(location != cpu){
        {
 		NRSMat<T> tmp= *this;
 		tmp.moveto(cpu);
 		tmp.put(fd,dim,transp);
 		return;
 	}
 #endif
 	errno = 0;
-if(dim)
+	if(dim){
 {
 		if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
 		if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
 	}
 	LA_traits<T>::multiput(NN2,fd,v,dim);
 }
 /***************************************************************************//**
 * routine for raw input 
 * @param[in] fd file descriptor for input
 * @param[in] dim number of elements intended for input
 * @param[in] transp reserved
 * @see NRSMat<T>::put(), NRSMat<T>::copyonwrite()
 ******************************************************************************/
 template <typename T>
-void NRSMat<T>::get(int fd, bool dim, bool transp)
+void NRSMat<T>::get(int fd, bool dim, bool transp) {
 {
 #ifdef CUDALA
-if(location!=cpu)
+	if(location != cpu){
        {
 		NRSMat<T> tmp;
 		tmp.moveto(cpu);
 		tmp.get(fd,dim,transp);
@@ -80,164 +83,304 @@ if(location!=cpu)
 	int nn0[2]; //align at least 8-byte
 	errno = 0;
-if(dim)
+	if(dim){
 {
 		if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read");
 		resize(nn0[0]);
-}
+	}else{
 else 
 		copyonwrite();
 	}
 	LA_traits<T>::multiget(NN2,fd,v,dim);
 }
-// conversion ctor, symmetrize general Mat into SMat
+/***************************************************************************//**
 * constructor symmetrizing given matrix \f$A\f$ of general type <code>T</code> yielding \f$(A+A^\mathrm{T})/2\f$
 * @param[in] rhs matrix \f$A\f$
 ******************************************************************************/
 template <typename T>
-NRSMat<T>::NRSMat(const NRMat<T> &rhs)
+NRSMat<T>::NRSMat(const NRMat<T> &rhs) {
-{
+	NOT_GPU(rhs);
 	nn = rhs.nrows();
 #ifdef DEBUG
-	if (nn != rhs.ncols()) laerror("attempt to convert non-square Mat to SMat");
+	if(nn != rhs.ncols()) laerror("attempt to convert nonsquare NRMat<T> to NRSMat<T>");
 #endif
 #ifdef CUDALA
 	location = rhs.getlocation();
 #endif
 	count = new int;
 	*count = 1;
 	v = new T[NN2];
-	int i, j, k=0;
+	int i, j, k(0);
-	for (i=0; i<nn; i++)
+	for(i=0; i<nn; i++){
-		for (j=0; j<=i;j++) v[k++] = (rhs[i][j] + rhs[j][i])/((T)2);
+		for(j=0; j<=i; j++){
 			v[k++] = (rhs[i][j] + rhs[j][i])/((T)2);
 		}
 	}
 }
-
+/***************************************************************************//**
-
+ * zero out this symmetric matrix of general type <code>T</code> and then set
-// assign to diagonal
+ * the diagonal elements to prescribed value
 * @param[in] a scalar value to be assigned to the diagonal
 * @return reference to the modified matrix
 ******************************************************************************/
 template <typename T>
-NRSMat<T> & NRSMat<T>::operator=(const T &a)
+NRSMat<T> & NRSMat<T>::operator=(const T &a) {
-{
+	NOT_GPU(*this);
 	copyonwrite();
 	memset(v, 0, NN2*sizeof(T));
-	for (int i=0; i<nn; i++) v[i*(i+1)/2+i] = a;
+	for(register int i=0; i<nn; i++) v[i*(i+1)/2 + i] = a;
 	return *this;
 }
-//get diagonal
+/***************************************************************************//**
 * get or divide by the diagonal of real symmetric double-precision matrix
 * @param[in, out] r vector for storing the diagonal
 * @param[in] divide
 * 	\li \c false save the diagonal to vector r
 *	\li \c true divide the vector r by the diagonal elements element-wise
 * @param[in] cache reserved
 * @return
 * 	\li <tt>divide == true</tt> NULL
 *	\li <tt>divide == false</tt> pointer to the first element of r
 ******************************************************************************/
 template <typename T>
-const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const
+const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const {
 {
 #ifdef DEBUG
-if(r.size()!=nn) laerror("incompatible vector in diagonalof()");
+	if(r.size() != nn) laerror("incompatible vector in const T* NRSMat<T>::diagonalof(NRVec<T> &, const bool, bool)");
 #endif
 	NOT_GPU(*this);
 	SAME_LOC(*this, r);
 	r.copyonwrite();
-if (divide)
+	if(divide){
-	for (int i=0; i<nn; i++) {T a =v[i*(i+1)/2+i]; if(a!=0.) r[i] /= a;}
+		for(register int i=0; i<nn; i++){
-else
+			const T a = v[i*(i+1)/2+i];
-        for (int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
+			if(a != 0.) r[i] /= a;
 		}
 	}else{
 	        for(register int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
 	}
 	return divide?NULL:&r[0];
 }
-// unary minus
+/***************************************************************************//**
 * implements unary minus operator for this symmetric
 * matrix of general type <code>T</code>
 * @return modified copy of this matrix
 ******************************************************************************/
 template <typename T>
-const NRSMat<T> NRSMat<T>::operator-() const
+const NRSMat<T> NRSMat<T>::operator-() const {
-{
+	NOT_GPU(*this);
-	NRSMat<T> result(nn);
+
-	for(int i=0; i<NN2; i++) result.v[i]= -v[i];
+	NRSMat<T> result(nn, getlocation());
 	for(register int i = 0; i<NN2; i++) result.v[i]= -v[i];
 	return result;
 }
-// trace of Smat
+/***************************************************************************//**
 * implements unary minus operator for this real symmetric matrix
 * @return modified copy of this matrix
 ******************************************************************************/
 template <>
 const NRSMat<double> NRSMat<double>::operator-() const {
 	NRSMat<double> result(nn, getlocation());
 #ifdef CUDALA
 	if(location == cpu){
 #endif
 		memcpy(result.v, v, NN2*sizeof(double));
 		cblas_dscal(NN2, -1., result.v, 1);
 #ifdef CUDALA
 	}else{
 		cublasDcopy(NN2, v, 1, result.v, 1);
 		TEST_CUBLAS("cublasDcopy");
 		cublasDscal(NN2, -1., result.v, 1);
 		TEST_CUBLAS("cublasDscal");
 	}
 #endif
 	return result;
 }
 /***************************************************************************//**
 * implements unary minus operator for this hermitian matrix
 * @return modified copy of this matrix
 ******************************************************************************/
 template <>
 const NRSMat<complex<double> > NRSMat<complex<double> >::operator-() const {
 	NRSMat<complex<double> > result(nn, getlocation());
 #ifdef CUDALA
        if(location == cpu) {
 #endif
 		memcpy(result.v, v, NN2*sizeof(complex<double>));
 		cblas_zscal(NN2, &CMONE, result.v, 1);
 #ifdef CUDALA
        }else{
 		cublasZcopy(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)result.v, 1);
 		TEST_CUBLAS("cublasZcopy");
 		cublasZscal(NN2, CUMONE, (cuDoubleComplex*)result.v, 1);
 		TEST_CUBLAS("cublasZscal");
 	}
 #endif
 	return result;
 }
 /***************************************************************************//**
 * @return the sum of the diagonal elements
 ******************************************************************************/
 template <typename T>
-const T NRSMat<T>::trace() const
+const T NRSMat<T>::trace() const {
-{
+	NOT_GPU(*this);
 	T tmp = 0;
-	for (int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
+	for(register int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
 	return tmp;
 }
 /***************************************************************************//**
 * fill this real symmetric matrix with
 * pseudorandom numbers generated from uniform distribution
 ******************************************************************************/
 template<>
-void NRSMat<double>::randomize(const double &x)
+void NRSMat<double>::randomize(const double &x) {
-{
+	NOT_GPU(*this);
-for(int i=0; i<NN2; ++i) v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
+
 	for(int i=0; i<NN2; ++i){
 		v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
 	}
 }
 /***************************************************************************//**
 * Fill this hermitian matrix with pseudorandom numbers generated from uniform 
 * distribution. The real and imaginary parts are generated independently.
 ******************************************************************************/
 template<>
-void NRSMat<complex<double> >::randomize(const double &x)
+void NRSMat<complex<double> >::randomize(const double &x) {
-{
+	for(register int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
-for(int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1.+RAND_MAX) -1.);
+	for(register int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
-for(int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1.+RAND_MAX) -1.);
+	for(register int i=0; i<nn; ++i){
-for(int i=0; i<nn; ++i) for(int j=0; j<=i; ++j) if(i==j) v[i*(i+1)/2+j].imag()=0; //hermitean 
+		for(register int j=0; j<=i; ++j){
 		       	if(i == j) v[i*(i+1)/2+j].imag() = 0; //hermitean
 		}
 	}
 }
-
+/***************************************************************************//**
-
+ * routine for formatted output via lawritemat
-// write matrix to the file with specific format
+ * @param[in] file pointer to <tt>FILE</tt> structure representing the output file
 * @param[in] format format specification in standard printf-like form
 * @param[in] modulo
 * @see lawritemat()
 ******************************************************************************/
 template <typename T>
-void NRSMat<T>::fprintf(FILE *file, const char *format, const int modulo) const
+void NRSMat<T>::fprintf(FILE *file, const char *format, const int modulo) const {
-{
+	NOT_GPU(*this);
 	lawritemat(file, (const T *)(*this) ,nn, nn, format, 2, modulo, 1);
 }
-// read matrix from the file with specific format
+
 /***************************************************************************//**
 * routine for formatted input via fscanf
 * @param[in] f pointer to <tt>FILE</tt> structure representing the input file
 * @param[in] format format specification in standard printf-like form
 ******************************************************************************/
 template <typename T>
-void NRSMat<T>::fscanf(FILE *f, const char *format)
+void NRSMat<T>::fscanf(FILE *f, const char *format) {
 {
 	int n, m;
 	NOT_GPU(*this);
 	if (::fscanf(f,"%d %d", &n, &m) != 2)
-		laerror("cannot read matrix dimensions in SMat::fscanf");
+		laerror("cannot read matrix dimensions in NRSMat<T>::fscanf(FILE *, const char *)");
-	if (n != m) laerror("different dimensions of SMat");
+	if (n != m) laerror("different dimensions in NRSMat<T>::fscanf(FILE *, const char *)");
 	resize(n);
 	for (int i=0; i<n; i++) 
 		for (int j=0; j<n; j++)
 			if (::fscanf(f,format,&((*this)(i,j))) != 1)
-				laerror("Smat - cannot read matrix element");
+				laerror("NRSMat<T>::fscanf(FILE *, const char *) - unable to read matrix element");
 }
-/*
+/***************************************************************************//**
- * BLAS specializations for double and complex<double>
+ * multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
- */
+ * with real double-precision dense matrix \f$A\f$
-
+ * @param[in] rhs real double-precision matrix \f$A\f$
-
+ * @return matrix produt \f$S\times{}A\f$
-
+ ******************************************************************************/
 // SMat * Mat
 //NOTE: dsymm is not appropriate as it works on UNPACKED symmetric matrix
 template<>
-const NRMat<double> NRSMat<double>::operator*(const NRMat<double> &rhs) const
+const NRMat<double> NRSMat<double>::operator*(const NRMat<double> &rhs) const {
 {
 #ifdef DEBUG
-	if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat");
+	if(nn != rhs.nrows()) laerror("incompatible dimensions in NRMat<double> NRSMat<double>::operator*(const NRMat<double> &)");
 #endif
 	SAME_LOC(*this, rhs);
 	NRMat<double> result(nn, rhs.ncols(), getlocation());
 #ifdef CUDALA
 	if(location == cpu){
 #endif
 		for(register int k = 0; k<rhs.ncols(); k++){
 			cblas_dspmv(CblasRowMajor, CblasLower, nn, 1.0, v, rhs[0] + k, rhs.ncols(), 0.0, result[0] + k, rhs.ncols());
 		}
 #ifdef CUDALA
 	}else{
 		for(register int k = 0; k<rhs.ncols(); k++){
 			cublasDspmv('U', nn, 1.0, v, rhs[0] + k, rhs.ncols(), 0.0, result[0] + k, rhs.ncols());
 			TEST_CUBLAS("cublasDspmv");
 		}
 	}
 #endif
 	NRMat<double> result(nn, rhs.ncols());
 	for (int k=0; k<rhs.ncols(); k++)
 		cblas_dspmv(CblasRowMajor, CblasLower, nn, 1.0, v, rhs[0]+k, rhs.ncols(),
 				0.0, result[0]+k, rhs.ncols());
 	return result;
 }
 /***************************************************************************//**
 * multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
 * with real double-precision dense matrix \f$A\f$
 * @param[in] rhs real double-precision matrix \f$A\f$
 * @return matrix produt \f$S\times{}A\f$
 ******************************************************************************/
 template<>
 const NRMat<complex<double> >
-NRSMat< complex<double> >::operator*(const NRMat< complex<double> > &rhs) const
+NRSMat<complex<double> >::operator*(const NRMat<complex<double> > &rhs) const {
 {
 #ifdef DEBUG
-	if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat");
+	if (nn != rhs.nrows()) laerror("incompatible dimensions in NRSMat<complex<double> >::operator*(const NRMat<complex<double> > &)");
 #endif
 	SAME_LOC(*this, rhs);
 	NRMat<complex<double> > result(nn, rhs.ncols(), getlocation());
 #ifdef CUDALA
 	if(location == cpu){
 #endif
 		for(register int k=0; k<rhs.ncols(); k++){
 			cblas_zhpmv(CblasRowMajor, CblasLower, nn, &CONE, v, rhs[0]+k, rhs.ncols(), &CZERO, result[0]+k, rhs.ncols());
 		}
 #ifdef CUDALA
 	}else{
 		for(register int k = 0; k<rhs.ncols(); k++){
 			cublasZhpmv('U', nn, 
 					CUONE, (cuDoubleComplex*)v, (cuDoubleComplex*)(rhs[0] + k), rhs.ncols(),
 					CUZERO, (cuDoubleComplex*)(result[0] + k), rhs.ncols());
 			TEST_CUBLAS("cublasDspmv");
 		}
 	}
 #endif
 	NRMat< complex<double> > result(nn, rhs.ncols());
 	for (int k=0; k<rhs.ncols(); k++)
 		cblas_zhpmv(CblasRowMajor, CblasLower, nn, &CONE, v, rhs[0]+k, rhs.ncols(),
 				&CZERO, result[0]+k, rhs.ncols());
 	return result;
 }
-
+/***************************************************************************//**
-
+ * multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
-// SMat * SMat
+ * with real double-precision symmetric matrix \f$T\f$
 * @return matrix produt \f$S\times{}T\f$ (not necessarily symmetric)
 ******************************************************************************/
 template<>
-const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const
+const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const {
 {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat");
+	if (nn != rhs.nn) laerror("incompatible dimensions in NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &)");
 #endif
 	NRMat<double> result(0.0, nn, nn);
 	double *p, *q;
@@ -283,154 +426,293 @@ const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const
 }
-
+/***************************************************************************//**
 * multiply this complex double-precision symmetric matrix \f$G\f$ stored in packed form
 * with complex double-precision symmetric matrix \f$H\f$
 * @return matrix produt \f$G\times{}H\f$ (not necessarily symmetric)
 ******************************************************************************/
 template<>
 const NRMat<complex<double> > 
-NRSMat< complex<double> >::operator*(const NRSMat< complex<double> > &rhs) const
+NRSMat<complex<double> >::operator*(const NRSMat<complex<double> > &rhs) const {
 {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat");
+	if (nn != rhs.nn) laerror("incompatible dimensions in NRSMat<complex<double> >::operator*(const NRSMat<complex<double> > &)");
 #endif
-	NRMat< complex<double> > result(0.0, nn, nn);
+	SAME_LOC(*this, rhs);
 	NRMat<complex<double> > result(nn, nn, getlocation());
 	NRMat<complex<double> > rhsmat(rhs);
 	result = *this * rhsmat;
 	return result;
 //	laerror("complex SMat*Smat not implemented");
 }
-
+/***************************************************************************//**
-
+ * compute inner product of this real symmetric matrix \f$A\f$ with given real symmetric matrix \f$B\f$
-// S dot S
+ * i.e. determine the value of
 * \f[\sum_{i,j}A_{i,j}B_{i,j}\f]
 * @param[in] rhs matrix \f$B\f$
 * @return computed inner product
 ******************************************************************************/
 template<>
-const double NRSMat<double>::dot(const NRSMat<double> &rhs) const
+const double NRSMat<double>::dot(const NRSMat<double> &rhs) const {
-{
+	double ret(0.);
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("dot of incompatible SMat's");
+	if (nn != rhs.nn) laerror("incompatible dimensions in double NRSMat<double>::dot(const NRSMat<double> &)");
 #endif
-	return cblas_ddot(NN2, v, 1, rhs.v, 1);
+	SAME_LOC(*this, rhs);
 #ifdef CUDALA
 	if(location == cpu){
 #endif
 		ret = cblas_ddot(NN2, v, 1, rhs.v, 1);
 #ifdef CUDALA
 	}else{
 		ret = cublasDdot(NN2, v, 1, rhs.v, 1);
 	}
 #endif
 	return ret;
 }
-
+/***************************************************************************//**
 * compute inner product of this complex symmetric matrix \f$A\f$ with given complex symmetric matrix \f$B\f$
 * i.e. determine the value of
 * \f[\sum_{i,j}\overbar{A_{i,j}}B_{i,j}\f]
 * @param[in] rhs matrix \f$B\f$
 * @return computed inner product
 ******************************************************************************/
 template<>
-const complex<double> 
+const complex<double> NRSMat<complex<double> >::dot(const NRSMat<complex<double> > &rhs) const {
 NRSMat< complex<double> >::dot(const NRSMat< complex<double> > &rhs) const
 {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("dot of incompatible SMat's");
+	if (nn != rhs.nn) laerror("incompatible dimensions in complex<double> NRSMat<complex<double> >::dot(const NRSMat<complex<double> > &)");
 #endif
 	complex<double> dot(0., 0.);
 	SAME_LOC(*this, rhs);
 #ifdef CUDALA
 	if(location == cpu){
 #endif
 	complex<double> dot;
 		cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
 #ifdef CUDALA
 	}else{
 		const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
 		dot = complex<double>(cuCreal(_dot), cuCimag(_dot));
 		TEST_CUBLAS("cublasZdotc");
 	}
 #endif
 	return dot;
 }
 /***************************************************************************//**
 * compute inner product of this real double-precision symmetric matrix \f$S\f$ of order \f$n\f$
 * with given real double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$
 * @param[in] rhs real double-precision vector \f$\vec{v}\f$
 * @return computed inner product
 ******************************************************************************/
 template<>
-const double NRSMat<double>::dot(const NRVec<double> &rhs) const
+const double NRSMat<double>::dot(const NRVec<double> &rhs) const {
-{
+	double ret(0.0);
 #ifdef DEBUG
-	if (NN2 != rhs.nn) laerror("dot of incompatible SMat's");
+	if(NN2 != rhs.nn) laerror("incompatible dimensions in double NRSMat<double>::dot(const NRVec<double> &)");
 #endif
 	SAME_LOC(*this, rhs);
 #ifdef CUDALA
 	if(location == cpu){
 #endif
 		ret = cblas_ddot(NN2, v, 1, rhs.v, 1);
 #ifdef CUDALA
 	}else{
 		ret = cublasDdot(NN2, v, 1, rhs.v, 1);
 		TEST_CUBLAS("cublasDdot");
 	}
 #endif
 	return cblas_ddot(NN2, v, 1, rhs.v, 1);
 }
-
+/***************************************************************************//**
 * compute inner product of this complex double-precision hermitian matrix \f$H\f$ of order \f$n\f$
 * with given complex double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$
 * @param[in] rhs complex double-precision vector \f$\vec{v}\f$
 * @return computed inner product
 ******************************************************************************/
 template<>
 const complex<double> 
-NRSMat< complex<double> >::dot(const NRVec< complex<double> > &rhs) const
+NRSMat<complex<double> >::dot(const NRVec<complex<double> > &rhs) const {
 {
 #ifdef DEBUG
-	if (NN2 != rhs.nn) laerror("dot of incompatible SMat's");
+	if(NN2 != rhs.nn) laerror("incompatible dimensions in complex<double>  NRSMat<complex<double> >::dot(const NRVec<complex<double> > &)");
 #endif
 	complex<double> dot(0., 0.);
 	SAME_LOC(*this, rhs);
 #ifdef CUDALA
 	if(location == cpu){
 #endif
 	complex<double> dot;
 		cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
 #ifdef CUDALA
 	}else{
 		const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)rhs.v, 1);
 		TEST_CUBLAS("cublasZdotc");
 		dot = complex<double>(cuCreal(_dot), cuCimag(_dot));
 	}
 #endif
 	return dot;
 }
-
+/***************************************************************************//**
-// norm of the matrix
+ * compute the Frobenius norm of this real double-precision symmetric  matrix
 * @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation
 ******************************************************************************/
 template<>
-const double  NRSMat<double>::norm(const double scalar) const
+const double NRSMat<double>::norm(const double scalar) const {
-{
+	if(!scalar){
-	if (!scalar) return cblas_dnrm2(NN2, v, 1);
+		double ret(0.);
-	double sum = 0;
+#ifdef CUDALA
-	int k = 0;
+		if(location == cpu){
-	for (int i=0; i<nn; ++i)
+#endif
-		for (int j=0; j<=i; ++j) {
+			ret = cblas_dnrm2(NN2, v, 1);
-			register double tmp;
+#ifdef CUDALA
-			tmp = v[k++];
+		}else{
 			ret = cublasDnrm2(NN2, v, 1);
 			TEST_CUBLAS("cublasDnrm2");
 		}
 #endif
 		return ret;
 	}
 	NOT_GPU(*this);
 	double sum(0.);
 	int k(0);
 	for(register int i=0; i<nn; ++i){
 		for(register int j=0; j<=i; ++j) {
 			register double tmp = v[k++];
 			if(i == j) tmp -= scalar;
 			sum += tmp*tmp;
 		}
 	}
 	return std::sqrt(sum);
 }
-
+/***************************************************************************//**
-
+ * compute the Frobenius norm of this complex double-precision hermitian  matrix
 * @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation
 ******************************************************************************/
 template<>
-const double NRSMat< complex<double> >::norm(const complex<double> scalar) const
+const double NRSMat< complex<double> >::norm(const complex<double> scalar) const {
-{
+	if(!(scalar.real()) && !(scalar.imag())){
-	if (!(scalar.real()) && !(scalar.imag()))
+		double ret(0.);
-		return cblas_dznrm2(NN2, v, 1);
+#ifdef CUDALA
-	double sum = 0;
+		if(location == cpu){
 #endif
 			ret = cblas_dznrm2(NN2, v, 1);
 #ifdef CUDALA
 		}else{
 			ret = cublasDznrm2(NN2, (cuDoubleComplex*)v, 1);
 			TEST_CUBLAS("cublasDznrm2");
 		}
 #endif
 		return ret;
 	}
 	int k(0);
 	double sum(0.);
 	complex<double> tmp;
-	int k = 0;
+
-	for (int i=0; i<nn; ++i)
+	for(register int i=0; i<nn; ++i){
-		for (int j=0; j<=i; ++j) {
+		for(register int j=0; j<=i; ++j){
 			tmp = v[k++];
 			if (i == j) tmp -= scalar;
 			sum += tmp.real()*tmp.real() + tmp.imag()*tmp.imag();
 		}
 	}
 	return std::sqrt(sum);
 }
-
+/***************************************************************************//**
-
+ * for this real double-precision symmetric matrix \f$S\f$ stored in packed form,
-
+ * real scalar value \f$\alpha\f$ and real double-precision symmetric matrix \f$T\f$, compute
-// axpy: S = S * a
+ * \f[S \leftarrow \alpha T + S\f]
 ******************************************************************************/
 template<>
-void NRSMat<double>::axpy(const double alpha, const NRSMat<double> & x)
+void NRSMat<double>::axpy(const double alpha, const NRSMat<double> &x) {
 {
 #ifdef DEBUG
-	if (nn != x.nn) laerror("axpy of incompatible SMats");
+	if(nn != x.nn) laerror("incompatible dimensions in void NRSMat<double>::axpy(const double, const NRSMat<double>&)");
 #endif
 	SAME_LOC(*this, x);
 	copyonwrite();
 #ifdef CUDALA
 	if(location == cpu){
 #endif
 		cblas_daxpy(NN2, alpha, x.v, 1, v, 1);
 #ifdef CUDALA
 	}else{
 		cublasDaxpy(NN2, alpha, x.v, 1, v, 1);
 		TEST_CUBLAS("cublasDaxpy");
 	}
 template<>
 void NRSMat< complex<double> >::axpy(const complex<double> alpha,
 			const NRSMat< complex<double> > & x)
 {
 #ifdef DEBUG
 	if (nn != x.nn) laerror("axpy of incompatible SMats");
 #endif
 	copyonwrite();
 	cblas_zaxpy(nn, &alpha, x.v, 1, v, 1);
 }
-//complex from real
+
 /***************************************************************************//**
 * for this complex double-precision hermitian matrix \f$H\f$ stored in packed form,
 * complex scalar value \f$\alpha\f$ and complex double-precision hermitian matrix \f$G\f$, compute
 * \f[H \leftarrow \alpha G + H\f]
 ******************************************************************************/
 template<>
-NRSMat<complex<double> >::NRSMat(const NRSMat<double> &rhs, bool imagpart)
+void NRSMat<complex<double> >::axpy(const complex<double> alpha, const NRSMat<complex<double> > & x) {
-: nn(rhs.nrows()),  v(new complex<double>[rhs.nrows()*(rhs.nrows()+1)/2]), count(new int(1))
+#ifdef DEBUG
-{
+	if(nn != x.nn) laerror("incompatible dimensions in void NRSMat<complex<double> >::axpy(const complex<double> , const NRSMat<complex<double> >&)");
-memset(v,0,nn*(nn+1)/2*sizeof(complex<double>));
+#endif
-cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2);
+	SAME_LOC(*this, x);
 	copyonwrite();
 #ifdef CUDALA
 	if(location == cpu){
 #endif
 		cblas_zaxpy(nn, &alpha, x.v, 1, v, 1);
 #ifdef CUDALA
 	}else{
 		const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
 		cublasZaxpy(NN2, _alpha, (cuDoubleComplex*)x.v, 1, (cuDoubleComplex*)v, 1);
 		TEST_CUBLAS("cublasZaxpy");
 	}
 #endif
 }
 /***************************************************************************//**
 * create hermitian matrix \f$H\f$ from given real double-precision symmetric
 * matrix \f$S\f$
 * @param[in] rhs real double-precision symmetric matrix \f$S\f$
 * @param[in] imagpart flag determining whether \f$S\f$ should correspond to the real or imaginary part of \f$H\f$
 ******************************************************************************/
 template<>
 NRSMat<complex<double> >::NRSMat(const NRSMat<double> &rhs, bool imagpart): nn(rhs.nrows()), count(new int(1)) {
 	//inconsistent in general case?
 	const int nnp1 = nn*(nn + 1)/2;
 #ifdef CUDALA
 	location = rhs.getlocation();
 	if(location == cpu){
 #endif
 		v = new complex<double>[nnp1];
 		memset(v, 0, nnp1*sizeof(complex<double>));
 		cblas_dcopy(nnp1, &rhs(0, 0), 1, ((double *)v) + (imagpart?1:0), 2);
 #ifdef CUDALA
 	}else{
 		v = (complex<double>*) gpualloc(nnp1*sizeof(complex<double>));
-//some template specializations leading to BLAS/CUBLAS calls
+		complex<double> *_val = gpuputcomplex(CZERO);
 		cublasZcopy(nnp1, (cuDoubleComplex*)_val, 0, (cuDoubleComplex*)v, 1);
 		TEST_CUBLAS("cublasZcopy");
 		gpufree(_val);
 		cublasDcopy(nnp1, (double*)(&rhs(0,0)), 1, ((double*)v) + (imagpart?1:0), 2);
 		TEST_CUBLAS("cublasDcopy"); 
 	}
 #endif
 }
-
+/***************************************************************************//**
-
+ * forced instantization in the corresponding object file
-//////////////////////////////////////////////////////////////////////////////
+ ******************************************************************************/
 ////// forced instantization in the corresponding object file
 template class NRSMat<double>;
 template class NRSMat<complex<double> >;
--- a/smat.h
+++ b/smat.h
--- a/vec.cc
+++ b/vec.cc
--- a/vec.h
+++ b/vec.h