cuda

2010-09-08 16:27:58 +00:00
parent e8cbf9e5fb
commit e580467e5a
14 changed files with 7106 additions and 3691 deletions
--- a/smat.cc
+++ b/smat.cc
@@ -1,3 +1,6 @@
+//------------------------------------------------------------------------------
+/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */
+//------------------------------------------------------------------------------
 /*
    LA: linear algebra C++ interface library
    Copyright (C) 2008 Jiri Pittner <jiri.pittner@jh-inst.cas.cz> or <jiri@pittnerovi.com>
@@ -25,219 +28,359 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <errno.h>
+
 extern "C" {
-extern ssize_t read(int, void *, size_t);
-extern ssize_t write(int, const void *, size_t);
+	extern ssize_t read(int, void *, size_t);
+	extern ssize_t write(int, const void *, size_t);
 }
-// TODO
-// specialize unary minus

 namespace LA {

-
-/*
- *  * Templates first, specializations for BLAS next
- *
- */
-
-//raw I/O
+/***************************************************************************//**
+ * routine for raw output 
+ * @param[in] fd file descriptor for output
+ * @param[in] dim if true, the matrix dimension is written (twice) to fd ahead of the elements
+ * @param[in] transp reserved
+ * @see NRMat<T>::get(), NRSMat<T>::copyonwrite()
+ ******************************************************************************/
 template <typename T>
-void NRSMat<T>::put(int fd, bool dim, bool transp) const
-{
+void NRSMat<T>::put(int fd, bool dim, bool transp) const {
 #ifdef CUDALA
-if(location!=cpu)
-        {
-        NRSMat<T> tmp= *this;
-        tmp.moveto(cpu);
-        tmp.put(fd,dim,transp);
-        return;
-        }
+	if(location != cpu){
+		NRSMat<T> tmp= *this;
+		tmp.moveto(cpu);
+		tmp.put(fd,dim,transp);
+		return;
+	}
+#endif
+	errno = 0;
+	if(dim){
+		if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
+		if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
+	}
+	LA_traits<T>::multiput(NN2,fd,v,dim);
+}
+
+/***************************************************************************//**
+ * routine for raw input 
+ * @param[in] fd file descriptor for input
+ * @param[in] dim if true, two ints are read from fd and the matrix is resized to the first one; otherwise copyonwrite() is called
+ * @param[in] transp reserved
+ * @see NRSMat<T>::put(), NRSMat<T>::copyonwrite()
+ ******************************************************************************/
+template <typename T>
+void NRSMat<T>::get(int fd, bool dim, bool transp) {
+#ifdef CUDALA
+	if(location != cpu){
+		NRSMat<T> tmp;
+		tmp.moveto(cpu);
+		tmp.get(fd,dim,transp);
+		tmp.moveto(location);
+		*this = tmp;
+		return;
+	}
 #endif

-errno=0;
-if(dim)
-{
-if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
-if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
-}
-LA_traits<T>::multiput(NN2,fd,v,dim);
+	int nn0[2]; //align at least 8-byte
+	errno = 0;
+	if(dim){
+		if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read");
+		resize(nn0[0]);
+	}else{
+		copyonwrite();
+	}
+	LA_traits<T>::multiget(NN2,fd,v,dim);
 }

+
+/***************************************************************************//**
+ * constructor symmetrizing given matrix \f$A\f$ of general type <code>T</code> yielding \f$(A+A^\mathrm{T})/2\f$
+ * @param[in] rhs matrix \f$A\f$
+ ******************************************************************************/
 template <typename T>
-void NRSMat<T>::get(int fd, bool dim, bool transp)
-{
-#ifdef CUDALA
-if(location!=cpu)
-        {
-        NRSMat<T> tmp;
-        tmp.moveto(cpu);
-        tmp.get(fd,dim,transp);
-        tmp.moveto(location);
-        *this = tmp;
-        return;
-        }
-#endif
+NRSMat<T>::NRSMat(const NRMat<T> &rhs) {
+	NOT_GPU(rhs);

-int nn0[2]; //align at least 8-byte
-errno=0;
-if(dim)
-{
-if(2*sizeof(int) != read(fd,&nn0,2*sizeof(int))) laerror("cannot read");
-resize(nn0[0]);
-}
-else 
-copyonwrite();
-LA_traits<T>::multiget(NN2,fd,v,dim);
-}
-
-
-// conversion ctor, symmetrize general Mat into SMat
-template <typename T>
-NRSMat<T>::NRSMat(const NRMat<T> &rhs)
-{
-nn=rhs.nrows();
+	nn = rhs.nrows();
 #ifdef DEBUG
-	if (nn != rhs.ncols()) laerror("attempt to convert non-square Mat to SMat");
+	if(nn != rhs.ncols()) laerror("attempt to convert nonsquare NRMat<T> to NRSMat<T>");
+#endif
+#ifdef CUDALA
+	location = rhs.getlocation();
 #endif
 	count = new int;
 	*count = 1;
 	v = new T[NN2];
-	int i, j, k=0;
-	for (i=0; i<nn; i++)
-		for (j=0; j<=i;j++) v[k++] = (rhs[i][j] + rhs[j][i])/((T)2);
+	int i, j, k(0);
+	for(i=0; i<nn; i++){
+		for(j=0; j<=i; j++){
+			v[k++] = (rhs[i][j] + rhs[j][i])/((T)2);
+		}
+	}
 }

-
-
-// assign to diagonal
+/***************************************************************************//**
+ * zero out this symmetric matrix of general type <code>T</code> and then set
+ * the diagonal elements to prescribed value
+ * @param[in] a scalar value to be assigned to the diagonal
+ * @return reference to the modified matrix
+ ******************************************************************************/
 template <typename T>
-NRSMat<T> & NRSMat<T>::operator=(const T &a)
-{
+NRSMat<T> & NRSMat<T>::operator=(const T &a) {
+	NOT_GPU(*this);
 	copyonwrite();
-	memset(v,0,NN2*sizeof(T));
-	for (int i=0; i<nn; i++) v[i*(i+1)/2+i] = a;
+	memset(v, 0, NN2*sizeof(T));
+	for(register int i=0; i<nn; i++) v[i*(i+1)/2 + i] = a;
 	return *this;
 }

-//get diagonal
+/***************************************************************************//**
+ * get the diagonal of this symmetric matrix of general type <code>T</code>, or divide a vector by it
+ * @param[in, out] r vector for storing the diagonal
+ * @param[in] divide
+ * 	\li \c false save the diagonal to vector r
+ *	\li \c true divide the vector r by the diagonal elements element-wise
+ * @param[in] cache reserved
+ * @return
+ * 	\li <tt>divide == true</tt> NULL
+ *	\li <tt>divide == false</tt> pointer to the first element of r
+ ******************************************************************************/
 template <typename T>
-const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const
-{
+const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const {
 #ifdef DEBUG
-if(r.size()!=nn) laerror("incompatible vector in diagonalof()");
+	if(r.size() != nn) laerror("incompatible vector in const T* NRSMat<T>::diagonalof(NRVec<T> &, const bool, bool)");
 #endif
+	NOT_GPU(*this);
+	SAME_LOC(*this, r);

-r.copyonwrite();
+	r.copyonwrite();

-if (divide)
-	for (int i=0; i<nn; i++) {T a =v[i*(i+1)/2+i]; if(a!=0.) r[i] /= a;}
-else
-        for (int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
-return divide?NULL:&r[0];
+	if(divide){
+		for(register int i=0; i<nn; i++){
+			const T a = v[i*(i+1)/2+i];
+			if(a != 0.) r[i] /= a;
+		}
+	}else{
+	        for(register int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
+	}
+
+	return divide?NULL:&r[0];
 }


-// unary minus
+/***************************************************************************//**
+ * implements unary minus operator for this symmetric
+ * matrix of general type <code>T</code>
+ * @return modified copy of this matrix
+ ******************************************************************************/
 template <typename T>
-const NRSMat<T> NRSMat<T>::operator-() const
-{
-	NRSMat<T> result(nn);
-	for(int i=0; i<NN2; i++) result.v[i]= -v[i];
+const NRSMat<T> NRSMat<T>::operator-() const {
+	NOT_GPU(*this);
+
+	NRSMat<T> result(nn, getlocation());
+	for(register int i = 0; i<NN2; i++) result.v[i]= -v[i];
 	return result;
 }

-// trace of Smat
+/***************************************************************************//**
+ * implements unary minus operator for this real symmetric matrix
+ * @return modified copy of this matrix
+ ******************************************************************************/
+template <>
+const NRSMat<double> NRSMat<double>::operator-() const {
+	NRSMat<double> result(nn, getlocation());
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		memcpy(result.v, v, NN2*sizeof(double));
+		cblas_dscal(NN2, -1., result.v, 1);
+#ifdef CUDALA
+	}else{
+		cublasDcopy(NN2, v, 1, result.v, 1);
+		TEST_CUBLAS("cublasDcopy");
+
+		cublasDscal(NN2, -1., result.v, 1);
+		TEST_CUBLAS("cublasDscal");
+	}
+#endif
+	return result;
+}
+
+/***************************************************************************//**
+ * implements unary minus operator for this hermitian matrix
+ * @return modified copy of this matrix
+ ******************************************************************************/
+template <>
+const NRSMat<complex<double> > NRSMat<complex<double> >::operator-() const {
+	NRSMat<complex<double> > result(nn, getlocation());
+#ifdef CUDALA
+        if(location == cpu) {
+#endif
+		memcpy(result.v, v, NN2*sizeof(complex<double>));
+		cblas_zscal(NN2, &CMONE, result.v, 1);
+
+#ifdef CUDALA
+        }else{
+		cublasZcopy(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)result.v, 1);
+		TEST_CUBLAS("cublasZcopy");
+
+		cublasZscal(NN2, CUMONE, (cuDoubleComplex*)result.v, 1);
+		TEST_CUBLAS("cublasZscal");
+	}
+#endif
+	return result;
+}
+
+/***************************************************************************//**
+ * @return the sum of the diagonal elements
+ ******************************************************************************/
 template <typename T>
-const T NRSMat<T>::trace() const
-{
+const T NRSMat<T>::trace() const {
+	NOT_GPU(*this);
+
 	T tmp = 0;
-	for (int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
+	for(register int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
 	return tmp;
 }

+/***************************************************************************//**
+ * fill this real symmetric matrix with
+ * pseudorandom numbers generated from uniform distribution
+ ******************************************************************************/
 template<>
-void NRSMat<double>::randomize(const double &x)
-{
-for(int i=0; i<NN2; ++i) v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
+void NRSMat<double>::randomize(const double &x) {
+	NOT_GPU(*this);
+
+	for(int i=0; i<NN2; ++i){
+		v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
+	}
 }

+/***************************************************************************//**
+ * Fill this hermitian matrix with pseudorandom numbers generated from uniform 
+ * distribution. The real and imaginary parts are generated independently.
+ ******************************************************************************/
 template<>
-void NRSMat<complex<double> >::randomize(const double &x)
-{
-for(int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1.+RAND_MAX) -1.);
-for(int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1.+RAND_MAX) -1.);
-for(int i=0; i<nn; ++i) for(int j=0; j<=i; ++j) if(i==j) v[i*(i+1)/2+j].imag()=0; //hermitean 
+void NRSMat<complex<double> >::randomize(const double &x) {
+	NOT_GPU(*this); // v is dereferenced on the host below; same guard as the double specialization
+	// real()/imag() are not assignable lvalues in standard C++ (C++11 onwards), so elements are rebuilt;
+	// all real parts are drawn first, then all imaginary parts, which keeps the original random() call order
+	for(int i=0; i<NN2; ++i) v[i] = complex<double>(x*(2.*random()/(1. + RAND_MAX) -1.), 0.);
+	for(int i=0; i<NN2; ++i) v[i] = complex<double>(v[i].real(), x*(2.*random()/(1. + RAND_MAX) -1.));
+	// hermitean: only the diagonal entries (packed index i*(i+1)/2+i) need a zero imaginary part
+	for(int i=0; i<nn; ++i) v[i*(i+1)/2+i] = complex<double>(v[i*(i+1)/2+i].real(), 0.);
 }

-
-
-// write matrix to the file with specific format
+/***************************************************************************//**
+ * routine for formatted output via lawritemat
+ * @param[in] file pointer to <tt>FILE</tt> structure representing the output file
+ * @param[in] format format specification in standard printf-like form
+ * @param[in] modulo
+ * @see lawritemat()
+ ******************************************************************************/
 template <typename T>
-void NRSMat<T>::fprintf(FILE *file, const char *format, const int modulo) const
-{
+void NRSMat<T>::fprintf(FILE *file, const char *format, const int modulo) const {
+	NOT_GPU(*this);
+
 	lawritemat(file, (const T *)(*this) ,nn, nn, format, 2, modulo, 1);
 }

-// read matrix from the file with specific format
+
+/***************************************************************************//**
+ * routine for formatted input via fscanf
+ * @param[in] f pointer to <tt>FILE</tt> structure representing the input file
+ * @param[in] format format specification in standard printf-like form
+ ******************************************************************************/
 template <typename T>
-void NRSMat<T>::fscanf(FILE *f, const char *format)
-{
+void NRSMat<T>::fscanf(FILE *f, const char *format) {
 	int n, m;
-	if (::fscanf(f,"%d %d",&n,&m) != 2)
-		laerror("cannot read matrix dimensions in SMat::fscanf");
-	if (n != m) laerror("different dimensions of SMat");
+	NOT_GPU(*this);
+
+	if (::fscanf(f,"%d %d", &n, &m) != 2)
+		laerror("cannot read matrix dimensions in NRSMat<T>::fscanf(FILE *, const char *)");
+	if (n != m) laerror("different dimensions in NRSMat<T>::fscanf(FILE *, const char *)");
 	resize(n);
 	for (int i=0; i<n; i++) 
 		for (int j=0; j<n; j++)
 			if (::fscanf(f,format,&((*this)(i,j))) != 1)
-				laerror("Smat - cannot read matrix element");
+				laerror("NRSMat<T>::fscanf(FILE *, const char *) - unable to read matrix element");
 }


-/*
- * BLAS specializations for double and complex<double>
- */
-
-
-
-// SMat * Mat
-//NOTE: dsymm is not appropriate as it works on UNPACKED symmetric matrix
+/***************************************************************************//**
+ * multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
+ * with real double-precision dense matrix \f$A\f$
+ * @param[in] rhs real double-precision matrix \f$A\f$
+ * @return matrix product \f$S\times{}A\f$
+ ******************************************************************************/
 template<>
-const NRMat<double> NRSMat<double>::operator*(const NRMat<double> &rhs) const
-{
+const NRMat<double> NRSMat<double>::operator*(const NRMat<double> &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat");
+	if(nn != rhs.nrows()) laerror("incompatible dimensions in NRMat<double> NRSMat<double>::operator*(const NRMat<double> &)");
+#endif
+	SAME_LOC(*this, rhs);
+	NRMat<double> result(nn, rhs.ncols(), getlocation());
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		for(register int k = 0; k<rhs.ncols(); k++){
+			cblas_dspmv(CblasRowMajor, CblasLower, nn, 1.0, v, rhs[0] + k, rhs.ncols(), 0.0, result[0] + k, rhs.ncols());
+		}
+#ifdef CUDALA
+	}else{
+		for(register int k = 0; k<rhs.ncols(); k++){
+			cublasDspmv('U', nn, 1.0, v, rhs[0] + k, rhs.ncols(), 0.0, result[0] + k, rhs.ncols());
+			TEST_CUBLAS("cublasDspmv");
+		}
+	}
 #endif
-	NRMat<double> result(nn, rhs.ncols());
-	for (int k=0; k<rhs.ncols(); k++)
-		cblas_dspmv(CblasRowMajor, CblasLower, nn, 1.0, v, rhs[0]+k, rhs.ncols(),
-				0.0, result[0]+k, rhs.ncols());
 	return result;
 }


+/***************************************************************************//**
+ * multiply this complex double-precision hermitian matrix \f$H\f$ stored in packed form
+ * with complex double-precision dense matrix \f$A\f$
+ * @param[in] rhs complex double-precision matrix \f$A\f$
+ * @return matrix product \f$H\times{}A\f$
+ ******************************************************************************/
 template<>
-const NRMat< complex<double> >
-NRSMat< complex<double> >::operator*(const NRMat< complex<double> > &rhs) const
-{
+const NRMat<complex<double> >
+NRSMat<complex<double> >::operator*(const NRMat<complex<double> > &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nrows()) laerror("incompatible dimensions in SMat*Mat");
+	if (nn != rhs.nrows()) laerror("incompatible dimensions in NRSMat<complex<double> >::operator*(const NRMat<complex<double> > &)");
+#endif
+	SAME_LOC(*this, rhs);
+	NRMat<complex<double> > result(nn, rhs.ncols(), getlocation());
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		for(register int k=0; k<rhs.ncols(); k++){
+			cblas_zhpmv(CblasRowMajor, CblasLower, nn, &CONE, v, rhs[0]+k, rhs.ncols(), &CZERO, result[0]+k, rhs.ncols());
+		}
+#ifdef CUDALA
+	}else{
+		for(register int k = 0; k<rhs.ncols(); k++){
+			cublasZhpmv('U', nn, 
+					CUONE, (cuDoubleComplex*)v, (cuDoubleComplex*)(rhs[0] + k), rhs.ncols(),
+					CUZERO, (cuDoubleComplex*)(result[0] + k), rhs.ncols());
+
+			TEST_CUBLAS("cublasZhpmv"); // label must name the routine actually called above
+		}
+	}
 #endif
-	NRMat< complex<double> > result(nn, rhs.ncols());
-	for (int k=0; k<rhs.ncols(); k++)
-		cblas_zhpmv(CblasRowMajor, CblasLower, nn, &CONE, v, rhs[0]+k, rhs.ncols(),
-				&CZERO, result[0]+k, rhs.ncols());
 	return result;
 }

-
-
-// SMat * SMat
+/***************************************************************************//**
+ * multiply this real double-precision symmetric matrix \f$S\f$ stored in packed form
+ * with real double-precision symmetric matrix \f$T\f$
+ * @return matrix product \f$S\times{}T\f$ (not necessarily symmetric)
+ ******************************************************************************/
 template<>
-const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const
-{
+const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat");
+	if (nn != rhs.nn) laerror("incompatible dimensions in NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &)");
 #endif
 	NRMat<double> result(0.0, nn, nn);
 	double *p, *q;
@@ -283,156 +426,295 @@ const NRMat<double> NRSMat<double>::operator*(const NRSMat<double> &rhs) const
 }


-
+/***************************************************************************//**
+ * multiply this complex double-precision symmetric matrix \f$G\f$ stored in packed form
+ * with complex double-precision symmetric matrix \f$H\f$
+ * @return matrix product \f$G\times{}H\f$ (not necessarily symmetric)
+ ******************************************************************************/
 template<>
-const NRMat< complex<double> > 
-NRSMat< complex<double> >::operator*(const NRSMat< complex<double> > &rhs) const
-{
+const NRMat<complex<double> > 
+NRSMat<complex<double> >::operator*(const NRSMat<complex<double> > &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("incompatible dimensions in SMat*SMat");
+	if (nn != rhs.nn) laerror("incompatible dimensions in NRSMat<complex<double> >::operator*(const NRSMat<complex<double> > &)");
 #endif
-	NRMat< complex<double> > result(0.0, nn, nn);
-	NRMat< complex<double> > rhsmat(rhs);
+	SAME_LOC(*this, rhs);
+	NRMat<complex<double> > result(nn, nn, getlocation());
+	NRMat<complex<double> > rhsmat(rhs);
 	result = *this * rhsmat;
 	return result;
-//	laerror("complex SMat*Smat not implemented");
 }


-
-
-// S dot S
+/***************************************************************************//**
+ * compute inner product of this real symmetric matrix \f$A\f$ with given real symmetric matrix \f$B\f$
+ * i.e. determine the value of
+ * \f[\sum_{i,j}A_{i,j}B_{i,j}\f]
+ * @param[in] rhs matrix \f$B\f$
+ * @return computed inner product
+ ******************************************************************************/
 template<>
-const double NRSMat<double>::dot(const NRSMat<double> &rhs) const
-{
+const double NRSMat<double>::dot(const NRSMat<double> &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("dot of incompatible SMat's");
+	if (nn != rhs.nn) laerror("incompatible dimensions in double NRSMat<double>::dot(const NRSMat<double> &)");
 #endif
-	return cblas_ddot(NN2, v, 1, rhs.v, 1);
+	SAME_LOC(*this, rhs);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		return cblas_ddot(NN2, v, 1, rhs.v, 1);
+#ifdef CUDALA
+	}else{
+		const double ret = cublasDdot(NN2, v, 1, rhs.v, 1);
+		TEST_CUBLAS("cublasDdot"); // check the CUBLAS status here too, as dot(const NRVec<double> &) does
+		return ret;
+	}
+#endif
 }


-
+/***************************************************************************//**
+ * compute inner product of this complex symmetric matrix \f$A\f$ with given complex symmetric matrix \f$B\f$
+ * i.e. determine the value of
+ * \f[\sum_{i,j}\overline{A_{i,j}}B_{i,j}\f]
+ * @param[in] rhs matrix \f$B\f$
+ * @return computed inner product
+ ******************************************************************************/
 template<>
-const complex<double> 
-NRSMat< complex<double> >::dot(const NRSMat< complex<double> > &rhs) const
-{
+const complex<double> NRSMat<complex<double> >::dot(const NRSMat<complex<double> > &rhs) const {
 #ifdef DEBUG
-	if (nn != rhs.nn) laerror("dot of incompatible SMat's");
+	if (nn != rhs.nn) laerror("incompatible dimensions in complex<double> NRSMat<complex<double> >::dot(const NRSMat<complex<double> > &)");
+#endif
+	complex<double> dot(0., 0.);
+	SAME_LOC(*this, rhs);
+
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
+#ifdef CUDALA
+	}else{
+		const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
+		dot = complex<double>(cuCreal(_dot), cuCimag(_dot));
+		TEST_CUBLAS("cublasZdotc");
+	}
 #endif
-	complex<double> dot;
-	cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
 	return dot;
 }


+/***************************************************************************//**
+ * compute inner product of this real double-precision symmetric matrix \f$S\f$ of order \f$n\f$
+ * with given real double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$
+ * @param[in] rhs real double-precision vector \f$\vec{v}\f$
+ * @return computed inner product
+ ******************************************************************************/
 template<>
-const double NRSMat<double>::dot(const NRVec<double> &rhs) const
-{
+const double NRSMat<double>::dot(const NRVec<double> &rhs) const {
 #ifdef DEBUG
-	if (NN2 != rhs.nn) laerror("dot of incompatible SMat's");
+	if(NN2 != rhs.nn) laerror("incompatible dimensions in double NRSMat<double>::dot(const NRVec<double> &)");
+#endif
+	SAME_LOC(*this, rhs);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		return cblas_ddot(NN2, v, 1, rhs.v, 1); // every path must return: the value was previously computed and dropped
+#ifdef CUDALA
+	}else{
+		const double ret = cublasDdot(NN2, v, 1, rhs.v, 1);
+		TEST_CUBLAS("cublasDdot");
+		return ret;
+	}
 #endif
-	return cblas_ddot(NN2, v, 1, rhs.v, 1);
 }


-
+/***************************************************************************//**
+ * compute inner product of this complex double-precision hermitian matrix \f$H\f$ of order \f$n\f$
+ * with given complex double-precision vector \f$\vec{v}\f$ of length \f$n(n+1)/2\f$
+ * @param[in] rhs complex double-precision vector \f$\vec{v}\f$
+ * @return computed inner product
+ ******************************************************************************/
 template<>
 const complex<double> 
-NRSMat< complex<double> >::dot(const NRVec< complex<double> > &rhs) const
-{
+NRSMat<complex<double> >::dot(const NRVec<complex<double> > &rhs) const {
 #ifdef DEBUG
-	if (NN2 != rhs.nn) laerror("dot of incompatible SMat's");
+	if(NN2 != rhs.nn) laerror("incompatible dimensions in complex<double>  NRSMat<complex<double> >::dot(const NRVec<complex<double> > &)");
+#endif
+	complex<double> dot(0., 0.);
+	SAME_LOC(*this, rhs);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
+#ifdef CUDALA
+	}else{
+		const cuDoubleComplex _dot = cublasZdotc(NN2, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)rhs.v, 1);
+		TEST_CUBLAS("cublasZdotc");
+		dot = complex<double>(cuCreal(_dot), cuCimag(_dot));
+	}
 #endif
-	complex<double> dot;
-	cblas_zdotc_sub(NN2, v, 1, rhs.v, 1, &dot);
 	return dot;
 }

-
-// norm of the matrix
+/***************************************************************************//**
+ * compute the Frobenius norm of this real double-precision symmetric  matrix
+ * @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation
+ ******************************************************************************/
 template<>
-const double  NRSMat<double>::norm(const double scalar) const
-{
-	if (!scalar) return cblas_dnrm2(NN2, v, 1);
-	double sum = 0;
-	int k = 0;
-	for (int i=0; i<nn; ++i)
-		for (int j=0; j<=i; ++j) {
-			register double tmp;
-			tmp = v[k++];
-			if (i == j) tmp -= scalar;
+const double NRSMat<double>::norm(const double scalar) const {
+	if(!scalar){
+		double ret(0.);
+#ifdef CUDALA
+		if(location == cpu){
+#endif
+			ret = cblas_dnrm2(NN2, v, 1);
+#ifdef CUDALA
+		}else{
+			ret = cublasDnrm2(NN2, v, 1);
+			TEST_CUBLAS("cublasDnrm2");
+		}
+#endif
+		return ret;
+	}
+
+	NOT_GPU(*this);
+
+	double sum(0.);
+	int k(0);
+	for(register int i=0; i<nn; ++i){
+		for(register int j=0; j<=i; ++j) {
+			register double tmp = v[k++];
+			if(i == j) tmp -= scalar;
 			sum += tmp*tmp;
 		}
+	}
 	return std::sqrt(sum);
 }

-
-
+/***************************************************************************//**
+ * compute the Frobenius norm of this complex double-precision hermitian  matrix
+ * @param[in] scalar subtract this scalar value from the diagonal elements before the norm computation
+ ******************************************************************************/
 template<>
-const double NRSMat< complex<double> >::norm(const complex<double> scalar) const
-{
-	if (!(scalar.real()) && !(scalar.imag()))
-		return cblas_dznrm2(NN2, v, 1);
-	double sum = 0;
+const double NRSMat< complex<double> >::norm(const complex<double> scalar) const {
+	if(!(scalar.real()) && !(scalar.imag())){
+		double ret(0.);
+#ifdef CUDALA
+		if(location == cpu){
+#endif
+			ret = cblas_dznrm2(NN2, v, 1);
+#ifdef CUDALA
+		}else{
+			ret = cublasDznrm2(NN2, (cuDoubleComplex*)v, 1);
+			TEST_CUBLAS("cublasDznrm2");
+		}
+#endif
+		return ret;
+	}
+	NOT_GPU(*this); // the shifted-diagonal loop below reads v on the host; same guard as the double version
+	int k(0);
+	double sum(0.);
 	complex<double> tmp;
-	int k = 0;
-	for (int i=0; i<nn; ++i)
-		for (int j=0; j<=i; ++j) {
+
+	for(register int i=0; i<nn; ++i){
+		for(register int j=0; j<=i; ++j){
 			tmp = v[k++];
 			if (i == j) tmp -= scalar;
 			sum += tmp.real()*tmp.real() + tmp.imag()*tmp.imag();
 		}
+	}
 	return std::sqrt(sum);
 }


-
-
-
-// axpy: S = S * a
+/***************************************************************************//**
+ * for this real double-precision symmetric matrix \f$S\f$ stored in packed form,
+ * real scalar value \f$\alpha\f$ and real double-precision symmetric matrix \f$T\f$, compute
+ * \f[S \leftarrow \alpha T + S\f]
+ ******************************************************************************/
 template<>
-void NRSMat<double>::axpy(const double alpha, const NRSMat<double> & x)
-{
+void NRSMat<double>::axpy(const double alpha, const NRSMat<double> &x) {
 #ifdef DEBUG
-	if (nn != x.nn) laerror("axpy of incompatible SMats");
+	if(nn != x.nn) laerror("incompatible dimensions in void NRSMat<double>::axpy(const double, const NRSMat<double>&)");
 #endif
+	SAME_LOC(*this, x);
 	copyonwrite();
-	cblas_daxpy(NN2, alpha, x.v, 1, v, 1);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_daxpy(NN2, alpha, x.v, 1, v, 1);
+#ifdef CUDALA
+	}else{
+		cublasDaxpy(NN2, alpha, x.v, 1, v, 1);
+		TEST_CUBLAS("cublasDaxpy");
+	}
+#endif
 }


-
+/***************************************************************************//**
+ * for this complex double-precision hermitian matrix \f$H\f$ stored in packed form,
+ * complex scalar value \f$\alpha\f$ and complex double-precision hermitian matrix \f$G\f$, compute
+ * \f[H \leftarrow \alpha G + H\f]
+ ******************************************************************************/
 template<>
-void NRSMat< complex<double> >::axpy(const complex<double> alpha,
-			const NRSMat< complex<double> > & x)
-{
+void NRSMat<complex<double> >::axpy(const complex<double> alpha, const NRSMat<complex<double> > & x) {
 #ifdef DEBUG
-	if (nn != x.nn) laerror("axpy of incompatible SMats");
+	if(nn != x.nn) laerror("incompatible dimensions in void NRSMat<complex<double> >::axpy(const complex<double> , const NRSMat<complex<double> >&)");
 #endif
+	SAME_LOC(*this, x);
 	copyonwrite();
-	cblas_zaxpy(nn, &alpha, x.v, 1, v, 1);
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_zaxpy(NN2, &alpha, x.v, 1, v, 1); // all NN2 packed elements, not just the first nn; matches the GPU branch
+#ifdef CUDALA
+	}else{
+		const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
+		cublasZaxpy(NN2, _alpha, (cuDoubleComplex*)x.v, 1, (cuDoubleComplex*)v, 1);
+		TEST_CUBLAS("cublasZaxpy");
+	}
+#endif
+	
 }

-//complex from real
+/***************************************************************************//**
+ * create hermitian matrix \f$H\f$ from given real double-precision symmetric
+ * matrix \f$S\f$
+ * @param[in] rhs real double-precision symmetric matrix \f$S\f$
+ * @param[in] imagpart flag determining whether \f$S\f$ should correspond to the real or imaginary part of \f$H\f$
+ ******************************************************************************/
 template<>
-NRSMat<complex<double> >::NRSMat(const NRSMat<double> &rhs, bool imagpart)
-: nn(rhs.nrows()),  v(new complex<double>[rhs.nrows()*(rhs.nrows()+1)/2]), count(new int(1))
-{
-memset(v,0,nn*(nn+1)/2*sizeof(complex<double>));
-cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2);
+NRSMat<complex<double> >::NRSMat(const NRSMat<double> &rhs, bool imagpart): nn(rhs.nrows()), count(new int(1)) {
+	//inconsistent in general case?
+	const int nnp1 = nn*(nn + 1)/2;
+#ifdef CUDALA
+	location = rhs.getlocation();
+	if(location == cpu){
+#endif
+		v = new complex<double>[nnp1];
+		memset(v, 0, nnp1*sizeof(complex<double>));
+		cblas_dcopy(nnp1, &rhs(0, 0), 1, ((double *)v) + (imagpart?1:0), 2);
+#ifdef CUDALA
+	}else{
+		v = (complex<double>*) gpualloc(nnp1*sizeof(complex<double>));
+
+		complex<double> *_val = gpuputcomplex(CZERO);
+		cublasZcopy(nnp1, (cuDoubleComplex*)_val, 0, (cuDoubleComplex*)v, 1);
+		TEST_CUBLAS("cublasZcopy");
+		gpufree(_val);
+
+		cublasDcopy(nnp1, (double*)(&rhs(0,0)), 1, ((double*)v) + (imagpart?1:0), 2);
+		TEST_CUBLAS("cublasDcopy"); 
+	}
+#endif
 }

-
-//some template specializations leading to BLAS/CUBLAS calls
-
-
-
-
-//////////////////////////////////////////////////////////////////////////////
-////// forced instantization in the corresponding object file
+/***************************************************************************//**
+ * forced instantiation in the corresponding object file
+ ******************************************************************************/
 template class NRSMat<double>;
-template class NRSMat< complex<double> >;
+template class NRSMat<complex<double> >;

 template class NRSMat<long long>;
 template class NRSMat<long>;