*** empty log message ***

2010-06-25 15:28:19 +00:00
parent eb0aaf9adf
commit 074c943862
13 changed files with 1938 additions and 464 deletions
--- a/cuda_la.cc
+++ b/cuda_la.cc
@@ -1,2 +1,70 @@
+#include "la_traits.h"
 #include "cuda_la.h"

+#ifdef CUDALA
+
+namespace LA {
+
+GPUID DEFAULT_LOC = cpu;
+
+void set_default_loc(const GPUID loc) 
+{
+DEFAULT_LOC = loc;
+}
+
+void *gpualloc(size_t size)
+{
+cublasStatus status;
+void *ptr=NULL;
+status = cublasAlloc(size,1,&ptr);
+if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
+return ptr;
+}
+
+
+void gpufree(void *ptr)
+{
+cublasStatus status = cublasFree(ptr);
+if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasFree");
+}
+
+void gpuget(size_t n, size_t elsize, const void *from, void *to)
+{
+cublasStatus status;
+status=cublasGetVector(n,elsize,from,1,to,1);
+if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasGetVector");
+}
+
+void gpuput(size_t n, size_t elsize, const void *from, void *to)
+{
+cublasStatus status;
+status=cublasSetVector(n,elsize,from,1,to,1);
+if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
+}
+
+double *gpuputdouble(const double &x)
+{
+cublasStatus status;
+void *ptr=NULL;
+status = cublasAlloc(1,sizeof(double),&ptr);
+if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
+status=cublasSetVector(1,sizeof(double),&x,1,ptr,1);
+if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
+return (double *)ptr;
+}
+
+complex<double> *gpuputcomplex(const complex<double> &x)
+{
+cublasStatus status;
+void *ptr=NULL;
+status = cublasAlloc(1,sizeof(complex<double>),&ptr);
+if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
+status=cublasSetVector(1,sizeof(complex<double>),&x,1,ptr,1);
+if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
+return (complex<double> *)ptr;
+}
+
+
+
+}
+#endif
--- a/cuda_la.h
+++ b/cuda_la.h
@@ -0,0 +1,56 @@
+#ifndef _CUDA_LA_H
+#define _CUDA_LA_H
+
+#ifdef CUDALA
+#undef MATPTR
+#include "cublas.h"
+#endif
+
+#include "la_traits.h"
+
+namespace LA {
+
+#ifdef CUDALA
+#define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");}
+#define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
+#define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
+#else
+#define NOT_GPU(x) {}
+#define SAME_LOC(x,y) {}
+#define SAME_LOC3(x,y,z) {}
+#endif
+
+typedef enum {undefined=-1, cpu=0, gpu1=1, gpu2=2, gpu3=3, gpu4=4} GPUID;
+
+#ifdef CUDALA
+
+//global static instantiation of this class will provide automatic init/shutdown of GPU
+class GPU_START {
+public:
+	GPU_START(void)
+		{
+		cublasStatus status = cublasInit();
+		if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot init GPU for CUBLAS");
+		}
+	~GPU_START(void)
+		{
+		cublasStatus status = cublasShutdown();
+		if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot cleanly shutdown GPU");
+                }
+};
+
+extern void *gpualloc(size_t size);
+extern void gpufree(void *ptr);
+extern void gpuget(size_t n, size_t elsize, const void *from, void *to);
+extern void gpuput(size_t n, size_t elsize, const void *from, void *to);
+extern double *gpuputdouble(const double &x);
+extern complex<double> *gpuputcomplex(const complex<double> &x);
+
+void set_default_loc(const GPUID loc);
+
+extern GPUID DEFAULT_LOC;
+
+
+#endif
+}
+#endif
--- a/la_traits.h
+++ b/la_traits.h
@@ -40,6 +40,8 @@

 #include "laerror.h"

+#include "cuda_la.h"
+
 #ifdef NONCBLAS
 #include "noncblas.h"
 #else
@@ -205,6 +207,8 @@ struct LA_traits_aux<complex<C>, scalar_true> {
 typedef complex<C> elementtype;
 typedef complex<C> producttype;
 typedef C normtype;
+typedef C realtype;
+typedef complex<C> complextype;
 static inline C sqrabs(const complex<C> x) { return x.real()*x.real()+x.imag()*x.imag();}
 static inline bool gencmp(const complex<C> *x, const complex<C> *y, int n) {return memcmp(x,y,n*sizeof(complex<C>));}
 static bool bigger(const  complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
@@ -230,6 +234,8 @@ struct LA_traits_aux<C, scalar_true> {
 typedef C elementtype;
 typedef C producttype;
 typedef C normtype;
+typedef C realtype;
+typedef complex<C> complextype;
 static inline C sqrabs(const C x) { return x*x;}
 static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));}
 static inline bool bigger(const  C &x, const C &y) {return x>y;}
@@ -261,6 +267,8 @@ struct LA_traits_aux<X<C>, scalar_false> { \
 typedef C elementtype; \
 typedef X<C> producttype; \
 typedef typename LA_traits<C>::normtype normtype; \
+typedef X<typename LA_traits<C>::realtype> realtype; \
+typedef X<typename LA_traits<C>::complextype> complextype; \
 static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
 static inline bool bigger(const  C &x, const C &y) {return x>y;} \
 static inline bool smaller(const  C &x, const C &y) {return x<y;} \
@@ -293,6 +301,8 @@ struct LA_traits_aux<X<C>, scalar_false> {  \
 typedef C elementtype;  \
 typedef NRMat<C> producttype;  \
 typedef typename LA_traits<C>::normtype normtype;  \
+typedef X<typename LA_traits<C>::realtype> realtype; \
+typedef X<typename LA_traits<C>::complextype> complextype; \
 static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
 static inline bool bigger(const  C &x, const C &y) {return x>y;} \
 static inline bool smaller(const  C &x, const C &y) {return x<y;} \
--- a/laerror.cc
+++ b/laerror.cc
@@ -25,6 +25,7 @@
 #include <errno.h>
 #include <stdarg.h>

+#include "cuda_la.h"

 #ifdef USE_TRACEBACK
 #include "traceback.h"
@@ -32,6 +33,11 @@

 namespace LA {

+//enforce GPU initialization by a global class instantization constructor
+#ifdef CUDALA
+GPU_START gpu_start_instant;
+#endif
+
 bool _LA_count_check=true;

 extern "C" void _findme(void) {}; //for autoconf test we need a function with C linkage
@@ -45,6 +51,13 @@ void laerror(const char *s1)
    std::cerr << s1  << "\n";
    std::cout << s1  << "\n";
  }
+#ifdef CUDALA
+{
+cublasStatus s=cublasGetError();
+std::cerr << "CUBLAS status = "<<s<<std::endl;
+std::cout << "CUBLAS status = "<<s<<std::endl;
+}
+#endif
  if(errno) perror("system error");

 throw LAerror(s1); 
--- a/mat.cc
+++ b/mat.cc
@@ -122,6 +122,15 @@ return r;
 template <typename T>
 void NRMat<T>::put(int fd, bool dim, bool transp) const
 {
+#ifdef CUDALA
+if(location!=cpu)
+	{
+	NRMat<T> tmp= *this;
+	tmp.moveto(cpu);
+	tmp.put(fd,dim,transp);
+	return;
+	}
+#endif
 errno=0;
 if(dim)
 {
@@ -153,6 +162,17 @@ else LA_traits<T>::multiput(nn*mm,fd,
 template <typename T>
 void NRMat<T>::get(int fd, bool dim, bool transp)
 {
+#ifdef CUDALA
+if(location!=cpu)
+        {
+        NRMat<T> tmp;
+	tmp.moveto(cpu);
+        tmp.get(fd,dim,transp);
+        tmp.moveto(location);
+	*this = tmp;
+        return;
+        }
+#endif
 int nn0,mm0;
 errno=0;
 if(dim)
@@ -188,6 +208,43 @@ else LA_traits<T>::multiget(nn*mm,fd,


 // Assign diagonal
+template <>
+NRMat<double> & NRMat<double>::operator=(const double &a)
+{
+        copyonwrite();
+#ifdef DEBUG
+        if (nn != mm) laerror("RMat.operator=scalar on non-square matrix");
+#endif
+#ifdef CUDALA
+        if(location==cpu)
+	{
+#endif
+#ifdef MATPTR
+         memset(v[0],0,nn*nn*sizeof(double));
+         for (int i=0; i< nn; i++) v[i][i] = a;
+#else
+	 double n=0.;
+	 cblas_dcopy(nn*nn, &n, 0, v, 1);
+	 cblas_dcopy(nn, &a, 0, v, nn+1);
+#endif
+#ifdef CUDALA
+	}
+	else
+	{
+	double *d=gpuputdouble(0.);
+        cublasDcopy(nn*nn, d, 0, v, 1);
+	gpufree(d);
+	d=gpuputdouble(a);
+        cublasDcopy(nn, d, 0, v, nn+1);
+	gpufree(d);
+	}
+#endif
+         return *this;
+}
+
+
+
+
 template <typename T>
 NRMat<T> & NRMat<T>::operator=(const T &a)
 {
@@ -206,6 +263,64 @@ NRMat<T> & NRMat<T>::operator=(const T &a)
 }


+template <>
+NRMat<double> & NRMat<double>::operator+=(const double&a)
+{
+        copyonwrite();
+#ifdef DEBUG
+        if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix");
+#endif
+#ifdef CUDALA
+	if(location==cpu)
+	{
+#endif
+#ifdef MATPTR
+        for (int i=0; i< nn; i++) v[i][i] += a;
+#else
+	cblas_daxpy(nn, 1.0, &a, 0, *this, nn+1);
+#endif
+#ifdef CUDALA
+	}
+	else
+	{
+	double *d=gpuputdouble(a);
+	cublasDaxpy(nn, 1.0, d, 0, *this, nn+1);
+	gpufree(d);
+	}
+#endif
+        return *this;
+}
+
+
+template <>
+NRMat<double> & NRMat<double>::operator-=(const double&a)
+{
+        copyonwrite();
+#ifdef DEBUG
+        if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix");
+#endif
+#ifdef CUDALA
+	if(location==cpu)
+	{
+#endif
+#ifdef MATPTR
+        for (int i=0; i< nn; i++) v[i][i] -= a;
+#else
+	cblas_daxpy(nn, -1.0, &a, 0, *this, nn+1);
+#endif
+#ifdef CUDALA
+	}
+	else
+	{
+        double *d=gpuputdouble(a);
+        cublasDaxpy(nn, -1.0, d, 0, *this, nn+1);
+        gpufree(d);
+	}
+#endif
+        return *this;
+}
+
+


 // M += a
@@ -240,6 +355,31 @@ NRMat<T> & NRMat<T>::operator-=(const T &a)
 	return *this;
 }

+template <>
+const NRMat<double> NRMat<double>::operator-() const
+{
+        NRMat<double> result(nn, mm);
+#ifdef CUDALA
+        if(location==cpu)
+	{
+#endif
+#ifdef MATPTR
+        for (int i=0; i<nn*mm; i++) result.v[0][i]= -v[0][i];
+#else
+	cblas_dscal(nn*mm, -1., v, 1);
+#endif
+#ifdef CUDALA
+        }
+	else
+	{
+	cublasDscal(nn*mm, -1., v, 1);
+	}
+#endif
+        return result;
+}
+
+
+
 // unary minus
 template <typename T>
 const NRMat<T> NRMat<T>::operator-() const
@@ -253,6 +393,7 @@ const NRMat<T> NRMat<T>::operator-() const
 	return result;
 }

+
 // direct sum
 template <typename T>
 const NRMat<T> NRMat<T>::operator&(const NRMat<T> & b) const
@@ -540,7 +681,13 @@ template<>
 NRMat<double> & NRMat<double>::operator*=(const double &a)
 {
 	copyonwrite();
-	cblas_dscal(nn*mm, a, *this, 1);
+#ifdef CUDALA
+	if(location==cpu)
+#endif
+		cblas_dscal(nn*mm, a, *this, 1);
+#ifdef CUDALA
+	else 	cublasDscal(nn*mm, a, v, 1);
+#endif
 	return *this;
 }

@@ -559,6 +706,7 @@ NRMat< complex<double> >::operator*=(const complex<double> &a)
 template <typename T>
 NRMat<T> & NRMat<T>::operator*=(const T &a)
 {
+NOT_GPU(*this);
        copyonwrite();
 #ifdef MATPTR
         for (int i=0; i< nn*mm; i++) v[0][i] *= a;
@@ -578,8 +726,16 @@ NRMat<double> & NRMat<double>::operator+=(const NRMat<double>  &rhs)
 	if (nn != rhs.nn || mm!= rhs.mm) 
 		laerror("Mat += Mat of incompatible matrices");
 #endif
+SAME_LOC(*this,rhs);
 	copyonwrite();
+#ifdef CUDALA
+	if(location==cpu)
+#endif
 	cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1);
+#ifdef CUDALA
+	else 
+	cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1);
+#endif
 	return *this;
 }

@@ -625,8 +781,16 @@ NRMat<double> & NRMat<double>::operator-=(const NRMat<double>  &rhs)
 	if (nn != rhs.nn || mm!= rhs.mm) 
 		laerror("Mat -= Mat of incompatible matrices");
 #endif
+	SAME_LOC(*this,rhs);
 	copyonwrite();
+#ifdef CUDALA
+	if(location==cpu)
+#endif
 	cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1);
+#ifdef CUDALA
+	else
+	cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1);
+#endif
 	return *this;
 }

@@ -836,9 +1000,18 @@ const NRMat<double> NRMat<double>::operator*(const NRMat<double> &rhs) const
 	if (mm != rhs.nn) laerror("product of incompatible matrices");
 	if (rhs.mm <=0) laerror("illegal matrix dimension in gemm");
 #endif
-	NRMat<double> result(nn, rhs.mm);
+SAME_LOC(*this,rhs);
+
+	NRMat<double> result(nn, rhs.mm,rhs.getlocation());
+#ifdef CUDALA
+        if(location==cpu)
+#endif
 	cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nn, rhs.mm, mm, 1.0,
 			*this, mm, rhs, rhs.mm, 0.0, result, rhs.mm);
+#ifdef CUDALA
+        else
+	cublasDgemm('N','N',rhs.mm,nn,mm,1.0,rhs, rhs.mm,*this, mm, 0.0, result, rhs.mm);
+#endif
 	return result;
 }

@@ -991,12 +1164,21 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
 	if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in Mat:gemm()");
 	if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm");
 #endif
+SAME_LOC3(*this,a,b);
+
 	if (alpha==0.0 && beta==1.0) return;

 	copyonwrite();
+#ifdef CUDALA
+	if(location==cpu)
+#endif
 	cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans),
 			(transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
 			a.mm, b , b.mm, beta, *this , mm);
+#ifdef CUDALA
+        else
+	cublasDgemm(transb,transa,mm,nn,k,alpha, b , b.mm, a,a.mm, beta, *this , mm);
+#endif
 }


@@ -1028,7 +1210,20 @@ void NRMat< complex<double> >::gemm(const complex<double> & beta,
 template<>
 const double  NRMat<double>::norm(const double scalar) const
 {
-	if (!scalar) return cblas_dnrm2(nn*mm, (*this)[0], 1);
+	if (!scalar)  
+		{
+#ifdef CUDALA
+		if(location==cpu)
+#endif
+		return cblas_dnrm2(nn*mm, (*this)[0], 1);
+#ifdef CUDALA
+		else
+		return cublasDnrm2(nn*mm, v, 1);
+#endif
+		}
+
+NOT_GPU(*this);
+
 	double sum = 0;
 	for (int i=0; i<nn; i++)
 		for (int j=0; j<mm; j++) {
@@ -1246,6 +1441,127 @@ else //vectors are columns



+//------------------------------------------------------------------------------
+//	for a matrix A(1:nn,1:mm) performs Fortran-like 
+//	operation A(nn:-1:1,:) 
+//------------------------------------------------------------------------------
+template<>
+NRMat<double>& NRMat<double>::SwapRows(){
+        copyonwrite();
+        const int n_pul = this->nn / 2;
+        double * const dataIn  = this->v;
+
+        for(register int i=0; i<n_pul; i++){
+                cblas_dswap(mm, dataIn + i*mm, 1, dataIn + (nn-i-1)*mm, 1);
+        }
+
+        return *this;
+}
+//------------------------------------------------------------------------------
+template<>
+NRMat<complex<double> >& NRMat<complex<double> >::SwapRows(){
+        copyonwrite();
+	const int n = this->nn;
+	const int m = this->mm;
+        const int n_pul = this->nn / 2;
+        complex<double> * const dataIn  = this->v;
+
+        for(register int i=0; i<n_pul; i++){
+                cblas_zswap(m, dataIn + i*m, 1, dataIn + (n-i-1)*m, 1);
+        }
+
+        return *this;
+}
+//------------------------------------------------------------------------------
+template<typename T>
+NRMat<T>& NRMat<T>::SwapRows(){
+        copyonwrite();
+	const int n = this->nn;
+	const int m = this->mm;
+        const int n_pul = this->nn / 2;
+        T * const dataIn  = this->v;
+
+        for(register int i=0; i<n_pul; i++){
+		const int offset1 = i*m;
+		const int offset2 = (n-i-1)*m;
+
+		for(register int j=0;j<m;j++){
+			dataIn[offset1 + j] = dataIn[offset2 + j];
+		}
+        }
+        return *this;
+}
+//------------------------------------------------------------------------------
+//	for a matrix A(1:nn,1:mm) performs Fortran-like 
+//	operation A(:,mm:-1:1) 
+//------------------------------------------------------------------------------
+template<>
+NRMat<double>& NRMat<double>::SwapCols(){
+        copyonwrite();
+	const int n = this->nn;
+	const int m = this->mm;
+        const int m_pul = m / 2;
+        double * const dataIn  = this->v;
+
+        for(register int i=0; i<m_pul; i++){
+                cblas_dswap(n, dataIn + i, m, dataIn + (m-i-1), m);
+        }
+
+        return *this;
+}
+//------------------------------------------------------------------------------
+template<>
+NRMat<complex<double> >& NRMat<complex<double> >::SwapCols(){
+        copyonwrite();
+        const int n_pul = this->nn / 2;
+        const int m_pul = this->mm / 2;
+        complex<double> * const dataIn  = this->v;
+
+        for(register int i=0; i<m_pul; i++){
+                cblas_zswap(nn, dataIn + i, mm, dataIn + (mm-i-1), mm);
+        }
+        return *this;
+}
+//------------------------------------------------------------------------------
+template<typename T>
+NRMat<T>& NRMat<T>::SwapCols(){
+        copyonwrite();
+        const int n_pul = nn / 2;
+        const int m_pul = mm / 2;
+        T * const dataIn  = this->v;
+
+        for(register int i=0; i<m_pul; i++){
+		for(register int j=0;j<nn;j++){
+			const int jm = j*mm;
+			dataIn[i + jm] = dataIn[(mm-i-1) + jm];
+		}
+        }
+        return *this;
+}
+//------------------------------------------------------------------------------
+//	for a matrix A(1:nn,1:mm) performs Fortran-like 
+//	operation A(nn:-1:1,mm:-1:1)
+//------------------------------------------------------------------------------
+template<typename T>
+NRMat<T>& NRMat<T>::SwapRowsCols(){
+        this->copyonwrite();
+        const int n = this->nn;
+        const int m = this->mm;
+        T * const dataIn  = this->v;
+        T * const dataOut = this->v;
+        
+	const int Dim = n*m;
+        for(register int i=0;i<n;i++){
+                const int off = i*n;
+                for(register int j=0;j<m;j++){
+                        const int offset = off + j;
+                        dataOut[Dim - (offset + 1)] = dataIn[offset];
+                }
+        }
+
+        return *this;
+}
+//------------------------------------------------------------------------------


 //////////////////////////////////////////////////////////////////////////////
--- a/mat.h
+++ b/mat.h
@@ -33,12 +33,20 @@ protected:
 	T *v;
 #endif
 	int *count;
+#ifdef CUDALA
+        GPUID location;
+#endif
 public:
 	friend class NRVec<T>;
 	friend class NRSMat<T>;
 	
-	inline NRMat() : nn(0), mm(0), v(0), count(0) {};
-	inline NRMat(const int n, const int m);
+	inline NRMat() : nn(0), mm(0), v(0), count(0) 
+			{
+#ifdef CUDALA
+			location = DEFAULT_LOC;
+#endif
+			};
+	inline NRMat(const int n, const int m ,const GPUID loc= undefined);
 	inline NRMat(const T &a, const int n, const int m);
 	NRMat(const T *a, const int n, const int m);
 	inline NRMat(const NRMat &rhs);
@@ -57,6 +65,13 @@ public:
 #endif
 	const bool operator==(const NRMat &rhs) const {return !(*this != rhs);};
 	inline int getcount() const {return count?*count:0;}
+#ifdef CUDALA
+	inline GPUID getlocation() const {return location;}
+	void moveto(const GPUID dest);
+#else
+	inline GPUID getlocation() const {return cpu;}
+	void moveto(const GPUID dest) {};
+#endif
 	NRMat & operator=(const NRMat &rhs);  //assignment
 	void randomize(const typename  LA_traits<T>::normtype &x); //fill with random numbers
 	NRMat & operator=(const T &a);    //assign a to diagonal
@@ -88,7 +103,7 @@ public:
 	const NRMat operator*(const NRSMat<T> &rhs) const; // Mat * Smat
 	const NRMat operator&(const NRMat &rhs) const; // direct sum
 	const NRMat operator|(const NRMat<T> &rhs) const; // direct product
-	const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
+	const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
 	const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
 	const NRVec<T> rsum() const; //sum of rows
 	const NRVec<T> csum() const; //sum of columns
@@ -157,9 +172,14 @@ public:
 namespace LA {
 // ctors
 template <typename T>
-NRMat<T>::NRMat(const int n, const int m) : nn(n), mm(m), count(new int)
+NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int)
 {
 	*count = 1;
+#ifdef CUDALA
+        location= (loc==undefined?DEFAULT_LOC:loc);
+        if(location==cpu)
+	{
+#endif
 #ifdef MATPTR
 	v = new T*[n];
 	v[0] = new T[m*n];
@@ -167,14 +187,29 @@ NRMat<T>::NRMat(const int n, const int m) : nn(n), mm(m), count(new int)
 #else
 	v = new T[m*n];
 #endif
+#ifdef CUDALA
+	}
+        else
+	{
+	v= (T*) gpualloc(n*m*sizeof(T));
+	}
+#endif
 }

 template <typename T>
 NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int)
 {
+#ifdef CUDALA
+        location=DEFAULT_LOC;
+#endif
+
 	int i;
 	T *p;
 	*count = 1;
+#ifdef CUDALA
+        if(location==cpu)
+	{
+#endif
 #ifdef MATPTR
 	v = new T*[n];
 	p = v[0] = new T[m*n];
@@ -186,12 +221,29 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
 		for (i=0; i< n*m; i++) *p++ = a;
 	else
 		memset(p, 0, n*m*sizeof(T));
+#ifdef CUDALA
+	}
+	else
+	{
+	v= (T*) gpualloc(n*m*sizeof(T));
+	cublasSetVector(n*m,sizeof(T),&a,0,v,1);
+	}
+#endif
+
 }

 template <typename T>
 NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int)
 {
+#ifdef CUDALA
+        location=DEFAULT_LOC;
+#endif
+
 	*count = 1;
+#ifdef CUDALA
+        if(location==cpu)
+        {
+#endif
 #ifdef MATPTR
 	v = new T*[n];
 	v[0] = new T[m*n];
@@ -201,11 +253,25 @@ NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new
 	v = new T[m*n];
 	memcpy(v, a, n*m*sizeof(T));
 #endif
+#ifdef CUDALA
+        }
+        else
+        {
+        v= (T*) gpualloc(n*m*sizeof(T));
+        cublasSetVector(n*m,sizeof(T),a,1,v,1);
+        }
+#endif
+
 }

+
+//copy constructor
 template <typename T>
 NRMat<T>::NRMat(const NRMat &rhs)
 {
+#ifdef CUDALA
+        location=rhs.location;
+#endif
 	nn = rhs.nn;
 	mm = rhs.mm;
 	count = rhs.count;
@@ -213,9 +279,16 @@ NRMat<T>::NRMat(const NRMat &rhs)
 	if (count) ++(*count);
 }

+
 template <typename T>
 NRMat<T>::NRMat(const NRSMat<T> &rhs)
 {
+NOT_GPU(rhs); 
+
+#ifdef CUDALA
+	location=rhs.location;
+#endif
+
 	int i;
 	nn = mm = rhs.nrows();
 	count = new int;
@@ -244,6 +317,10 @@ NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset)
 {
 	if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");

+#ifdef CUDALA
+location=rhs.location;
+#endif
+
 	nn = n;
 	mm = m;
 	count = rhs.count;
@@ -303,6 +380,7 @@ inline T & NRMat<T>::operator()(const int i, const int j)
 	if (_LA_count_check && *count != 1) laerror("Mat lval use of (,) with count > 1");
 	if (i<0 || i>=nn &&nn>0 || j<0 || j>=mm && mm>0) laerror("Mat (,) out of range");
 	if (!v) laerror("(,) for unallocated Mat");
+NOT_GPU(*this);
 #endif
 #ifdef MATPTR
 	return v[i][j];
@@ -310,12 +388,14 @@ inline T & NRMat<T>::operator()(const int i, const int j)
 	return v[i*mm+j];
 #endif
 }
+
 template <typename T>
 inline const T & NRMat<T>::operator()(const int i, const int j) const
 {
 #ifdef DEBUG
 	if (i<0 || i>=nn&&nn>0 || j<0 || j>=mm&& mm>0) laerror("Mat (,) out of range");
 	if (!v) laerror("(,) for unallocated Mat");
+NOT_GPU(*this); //in principle we could copy the element to CPU memory, yielding, however, a highly inneficient contruct 
 #endif
 #ifdef MATPTR
 	return v[i][j];
@@ -391,7 +471,7 @@ inline const complex<double>  NRMat< complex<double> >::amax() const
 }


-//basi stuff to be available for any type ... must be in .h
+//basic stuff to be available for any type ... must be in .h
 // dtor
 template <typename T>
 NRMat<T>::~NRMat()
@@ -399,10 +479,21 @@ NRMat<T>::~NRMat()
        if (!count) return;
        if (--(*count) <= 0) {
                if (v) {
+#ifdef CUDALA
+		    if(location==cpu)
+#endif
+		    	{
 #ifdef MATPTR
                        delete[] (v[0]);
 #endif
                        delete[] v;
+		    	}
+#ifdef CUDALA
+		    else 
+			{
+			gpufree(v);
+			}
+#endif
                }
                delete count;
        }
@@ -415,14 +506,27 @@ NRMat<T> & NRMat<T>::operator=(const NRMat<T> &rhs)
        if (this !=&rhs)
 		{
        	if (count) 
-                    if (--(*count) ==0 ) {
+                    if (--(*count) ==0 ) 
+			{
+#ifdef CUDALA
+			if(location==cpu) 
+			{
+#endif
 #ifdef MATPTR
                        delete[] (v[0]);
 #endif
                        delete[] v;
+#ifdef CUDALA
+			}
+			else gpufree(v);
+#endif
+
                        delete count;
                	}
                v = rhs.v;
+#ifdef CUDALA
+                location=rhs.location;
+#endif
                nn = rhs.nn;
                mm = rhs.mm;
                count = rhs.count;
@@ -437,46 +541,8 @@ template <typename T>
 NRMat<T> & NRMat<T>::operator|=(const NRMat<T> &rhs)
 {
        if (this == &rhs) return *this;
-#ifdef DEBUG
-        if (!rhs.v) laerror("unallocated rhs in Mat operator |=");
-#endif
-        if (count)
-                if (*count > 1) {
-                        --(*count);
-                        nn = 0;
-                        mm = 0;
-                        count = 0;
-                        v = 0;
-                }
-        if (nn != rhs.nn || mm != rhs.mm) {
-                if (v) {
-#ifdef MATPTR
-                        delete[] (v[0]);
-#endif
-                        delete[] (v);
-                        v = 0;
-                }
-                nn = rhs.nn;
-                mm = rhs.mm;
-        }
-        if (!v) {
-#ifdef MATPTR
-                v = new T*[nn];
-                v[0] = new T[mm*nn];
-#else
-                v = new T[mm*nn];
-#endif
-        }
-#ifdef MATPTR
-        for (int i=1; i< nn; i++) v[i] = v[i-1] + mm;
-        memcpy(v[0], rhs.v[0], nn*mm*sizeof(T));
-#else
-        memcpy(v, rhs.v, nn*mm*sizeof(T));
-#endif
-
-        if (!count) count = new int;
-        *count = 1;
-
+	*this = rhs;
+	this->copyonwrite();
        return *this;
 }

@@ -486,9 +552,13 @@ void NRMat<T>::copyonwrite()
 {
        if (!count) laerror("Mat::copyonwrite of undefined matrix");
        if (*count > 1) {
-                (*count)--;
-                count = new int;
-                *count = 1;
+             (*count)--;
+             count = new int;
+             *count = 1;
+#ifdef CUDALA
+	     if(location==cpu) //matrix is in CPU memory
+	     {
+#endif
 #ifdef MATPTR
                T **newv = new T*[nn];
                newv[0] = new T[mm*nn];
@@ -499,10 +569,21 @@ void NRMat<T>::copyonwrite()
                T *newv = new T[mm*nn];
                memcpy(newv, v, mm*nn*sizeof(T));
                v = newv;
+#endif
+#ifdef CUDALA
+	     }
+          else //matrix is in GPU memory
+		{
+		T *newv = (T *) gpualloc(mm*nn*sizeof(T));
+		if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
+                cublasScopy(nn*mm*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
+                v = newv;
+		}
 #endif
        }
 }

+
 template <typename T>
 void NRMat<T>::resize(int n, int m)
 {
@@ -519,10 +600,18 @@ if(m==0) n=0;
 	        if(n==0 && m==0)
        		{
 		        if(--(*count) <= 0) {
+#ifdef CUDALA
+                           if(location==cpu)
+				{
+#endif
 #ifdef MATPTR
                		if(v) delete[] (v[0]);
 #endif
                		if(v) delete[] v;
+#ifdef CUDALA
+				}
+			    else gpufree(v);
+#endif
 		                delete count;
 		                }
 		        count=0;
@@ -543,6 +632,10 @@ if(m==0) n=0;
                *count = 1;
                nn = n;
                mm = m;
+#ifdef CUDALA
+                if(location==cpu) 
+		{
+#endif
 #ifdef MATPTR
                v = new T*[nn];
                v[0] = new T[m*n];
@@ -550,12 +643,22 @@ if(m==0) n=0;
 #else
                v = new T[m*n];
 #endif
+#ifdef CUDALA
+                }
+		else
+		v = (T *) gpualloc(n*m*sizeof(T));
+#endif
+
                return;
        }
        // At this point *count = 1, check if resize is necessary
        if (n!=nn || m!=mm) {
                nn = n;
                mm = m;
+#ifdef CUDALA
+             if(location==cpu)
+	     {
+#endif
 #ifdef MATPTR
                delete[] (v[0]);
 #endif
@@ -566,6 +669,14 @@ if(m==0) n=0;
                for (int i=1; i< n; i++) v[i] = v[i-1] + m;
 #else
                v = new T[m*n];
+#endif
+#ifdef CUDALA
+             }
+            else
+	     {
+             gpufree(v);
+             v=(T *) gpualloc(n*m*sizeof(T));
+	     }
 #endif
        }
 }
@@ -587,7 +698,11 @@ return r;
 // I/O
 template <typename T>
 std::ostream& operator<<(std::ostream &s, const NRMat<T> &x)
-                {
+{
+#ifdef CUDALA
+        if(x.getlocation()==cpu)
+		{
+#endif
                int i,j,n,m;
                n=x.nrows();
                m=x.ncols();
@@ -597,18 +712,43 @@ std::ostream& operator<<(std::ostream &s, const NRMat<T> &x)
                        for(j=0; j<m;j++) s << (typename LA_traits_io<T>::IOtype) x[i][j] << (j==m-1 ? '\n' : ' '); // endl cannot be used in the conditional expression, since it is an overloaded function
                        }
                return s;
-                }
+#ifdef CUDALA
+		}
+	else
+		{
+		NRMat<T> tmp=x;
+		tmp.moveto(cpu);
+		return s<<tmp;
+		}
+#endif
+}

 template <typename T>
 std::istream& operator>>(std::istream  &s, NRMat<T> &x)
+{
+#ifdef CUDALA
+        if(x.getlocation()==cpu)
                {
+#endif
                int i,j,n,m;
                s >> n >> m;
                x.resize(n,m);
 		typename LA_traits_io<T>::IOtype tmp;
                for(i=0;i<n;i++) for(j=0; j<m;j++) { s>>tmp; x[i][j]=tmp;}
                return s;
-                }
+#ifdef CUDALA
+		}
+	else
+		{
+		NRMat<T> tmp;
+		tmp.moveto(cpu);
+		s >> tmp;
+		tmp.moveto(x.getlocation());
+		x=tmp;
+		return s;
+		}
+#endif
+}


 //optional indexing from 1
@@ -671,6 +811,38 @@ NRMat<T> & NRMat<T>::operator^=(const NRMat<T>  &rhs){
 }


+#ifdef CUDALA
+template<typename T>
+void NRMat<T>::moveto(const GPUID dest)
+{
+if(location==dest) return;
+location=dest;
+
+if(v && !count) laerror("internal inconsistency of reference counting 1");
+if (!count) return;
+
+if(v && *count==0) laerror("internal inconsistency of reference counting 2");
+if(!v) return;
+
+T *vold = v;
+
+if(dest == cpu) //moving from GPU to CPU
+	{
+	v = new T[nn*mm];
+	gpuget(nn*mm,sizeof(T),vold,v);
+	if(*count == 1) gpufree(vold);
+	else {--(*count); count = new int(1);}
+	}
+else	//moving from CPU to GPU
+	{
+	v=(T *) gpualloc(nn*mm*sizeof(T));
+	gpuput(nn*mm,sizeof(T),vold,v);
+	if(*count == 1) delete[] vold;
+	else {--(*count); count = new int(1);}
+	}
+}
+#endif
+//end CUDALA



--- a/noncblas.cc
+++ b/noncblas.cc
@@ -20,106 +20,186 @@
 #include "noncblas.h"
 #include "laerror.h"
 #include "mat.h"
+#include "fortran.h"


-#ifdef FORTRAN_
-#define FORNAME(x) x##_
-#else
-#define FORNAME(x) x
-#endif
-
 #ifdef NONCBLAS
 //Level 1 - straightforward wrappers

-extern "C" double FORNAME(ddot) (const int *n, const double *x, const int *incx, const double *y, const int *incy);
+extern "C" double FORNAME(ddot) (const FINT *n, const double *x, const FINT *incx, const double *y, const FINT *incy);
 double cblas_ddot(const int N, const double *X, const int incX,
                  const double *Y, const int incY)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp);
+#else
 return FORNAME(ddot)(&N,X,&incX,Y,&incY);
+#endif
 }

-extern "C" void FORNAME(dscal) (const int *n, const double *a, double *x, const int *incx);
+extern "C" void FORNAME(dscal) (const FINT *n, const double *a, double *x, const FINT *incx);
 void cblas_dscal(const int N, const double alpha, double *X, const int incX)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp);
+#else
 FORNAME(dscal) (&N,&alpha,X,&incX);
+#endif
 }

-extern "C" void FORNAME(dcopy) (const int *n, const double *x, const int *incx, double *y, const int *incy);
+extern "C" void FORNAME(dcopy) (const FINT *n, const double *x, const FINT *incx, double *y, const FINT *incy);
 void cblas_dcopy(const int N, const double *X, const int incX,
                 double *Y, const int incY)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
+#else
 FORNAME(dcopy) (&N,X,&incX,Y,&incY);
+#endif
 }

-extern "C" void FORNAME(daxpy) (const int *n, const double *a, const double *x, const int *incx, double *y, const int *incy);
+extern "C" void FORNAME(daxpy) (const FINT *n, const double *a, const double *x, const FINT *incx, double *y, const FINT *incy);
 void cblas_daxpy(const int N, const double alpha, const double *X,
                 const int incX, double *Y, const int incY)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp);
+#else
 FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY);
+#endif
 }

-extern "C" double FORNAME(dnrm2) (const int *n, const double *x, const int *incx);
+extern "C" double FORNAME(dnrm2) (const FINT *n, const double *x, const FINT *incx);
 double cblas_dnrm2(const int N, const double *X, const int incX)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+return FORNAME(dnrm2) (&ntmp,X,&incxtmp);
+#else
 return FORNAME(dnrm2) (&N,X,&incX);
+#endif
 }

-extern "C" double FORNAME(dasum) (const int *n, const double *x, const int *incx);
+extern "C" double FORNAME(dasum) (const FINT *n, const double *x, const FINT *incx);
 double cblas_dasum(const int N, const double *X, const int incX)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+return FORNAME(dasum) (&ntmp,X,&incxtmp);
+#else
 return FORNAME(dasum) (&N,X,&incX);
+#endif
 }

-extern "C" void FORNAME(zcopy) (const int *n, const void *x, const int *incx, void *y, const int *incy);
+extern "C" void FORNAME(zcopy) (const FINT *n, const void *x, const FINT *incx, void *y, const FINT *incy);
 void cblas_zcopy(const int N, const void *X, const int incX,
                 void *Y, const int incY)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
+#else
 FORNAME(zcopy) (&N,X,&incX,Y,&incY);
+#endif
 }

-extern "C" void FORNAME(zaxpy) (const int *n, const void *a, const void *x, const int *incx, void *y, const int *incy);
+extern "C" void FORNAME(zaxpy) (const FINT *n, const void *a, const void *x, const FINT *incx, void *y, const FINT *incy);
 void cblas_zaxpy(const int N, const void *alpha, const void *X,
                 const int incX, void *Y, const int incY)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp);
+#else
 FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY);
+#endif
 }

-extern "C" void FORNAME(zscal) (const int *n, const void *a, void *x, const int *incx);
+extern "C" void FORNAME(zscal) (const FINT *n, const void *a, void *x, const FINT *incx);
 void cblas_zscal(const int N, const void *alpha, void *X, const int incX)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+FORNAME(zscal)(&ntmp,alpha,X,&incxtmp);
+#else
 FORNAME(zscal)(&N,alpha,X,&incX);
+#endif
 }

-extern "C" void FORNAME(zdscal) (const int *n, const double *a, void *x, const int *incx);
+extern "C" void FORNAME(zdscal) (const FINT *n, const double *a, void *x, const FINT *incx);
 void cblas_zdscal(const int N, const double alpha, void *X, const int incX)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp);
+#else
 FORNAME(zdscal)(&N,&alpha,X,&incX);
+#endif
 }


-extern "C" double FORNAME(dznrm2) (const int *n, const void *x, const int *incx);
+extern "C" double FORNAME(dznrm2) (const FINT *n, const void *x, const FINT *incx);
 double cblas_dznrm2(const int N, const void *X, const int incX)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+return FORNAME(dznrm2) (&ntmp,X,&incxtmp);
+#else
 return FORNAME(dznrm2) (&N,X,&incX);
+#endif
 }


 //the following ones are f2c-compatible, but is it truly portable???

-extern "C" void FORNAME(zdotu) (void *retval, const int *n, const void *x, const int *incx, const void *y, const int *incy);
+extern "C" void FORNAME(zdotu) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);

 void   cblas_zdotu_sub(const int N, const void *X, const int incX,
                       const void *Y, const int incY, void *dotu)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp);
+#else
 FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY);
+#endif
 }


-extern "C" void FORNAME(zdotc) (void *retval, const int *n, const void *x, const int *incx, const void *y, const int *incy);
+extern "C" void FORNAME(zdotc) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
 void   cblas_zdotc_sub(const int N, const void *X, const int incX,
                       const void *Y, const int incY, void *dotc)
 {
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp);
+#else
 FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
+#endif
 }


@@ -129,7 +209,7 @@ FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
 //enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, AtlasConj=114};


-extern "C" void FORNAME(dspmv) (const char *uplo, const int *n, const double *alpha, const double *ap, const double *x, const int *incx, const double *beta, double *y, const int *incy);
+extern "C" void FORNAME(dspmv) (const char *uplo, const FINT *n, const double *alpha, const double *ap, const double *x, const FINT *incx, const double *beta, double *y, const FINT *incy);
 void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
                 const int N, const double alpha, const double *Ap,
                 const double *X, const int incX,
@@ -137,11 +217,18 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
 {
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
+#else
 FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
+#endif
 }


-extern "C" void FORNAME(zhpmv) (const char *uplo, const int *n, const void *alpha, const void *ap, const void *x, const int *incx, const void *beta, void *y, const int *incy);
+extern "C" void FORNAME(zhpmv) (const char *uplo, const FINT *n, const void *alpha, const void *ap, const void *x, const FINT *incx, const void *beta, void *y, const FINT *incy);
 void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
                 const int N, const void *alpha, const void *Ap,
                 const void *X, const int incX,
@@ -149,20 +236,36 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
 {
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
+#else
 FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
+#endif
 }


 //Level 2 and Level 3 on general matrices - take into account the transposed storage of matrices in Fortran and C

-extern "C" void FORNAME(dger) (const int *m, const int *n, const double *alpha, const double *x, const int *incx, const double *y, const int *incy, double *a, const int *lda);
+extern "C" void FORNAME(dger) (const FINT *m, const FINT *n, const double *alpha, const double *x, const FINT *incx, const double *y, const FINT *incy, double *a, const FINT *lda);
 void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
                const double alpha, const double *X, const int incX,
                const double *Y, const int incY, double *A, const int lda)
 {
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 //swap m-n, y-x
+#ifdef FORINT
+const FINT mtmp=M;
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+const FINT ldatmp=lda;
+FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp);
+#else
 FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda);
+#endif
 }

 void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
@@ -179,7 +282,7 @@ void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
 LA::laerror("cblas_zgeru cannot be simply converted to fortran order");
 }

-extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc);
+extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const double *alpha, const double *a, const FINT *lda, const double *b, const FINT *ldb, const double *beta, double *c, const FINT *ldc);
 void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
                 const int K, const double alpha, const double *A,
@@ -188,11 +291,22 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
 {
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 //swap a-b, m-n
+#ifdef FORINT
+const FINT mtmp=M;
+const FINT ntmp=N;
+const FINT ktmp=K;
+const FINT ldatmp=lda;
+const FINT ldbtmp=ldb;
+const FINT ldctmp=ldc;
+FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
+	 &ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
+#else
 FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
 	 &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
+#endif
 }

-extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const int *m, const int *n, const int *k, const void *alpha, const void *a, const int *lda, const void *b, const int *ldb, const void *beta, void *c, const int *ldc);
+extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const void *alpha, const void *a, const FINT *lda, const void *b, const FINT *ldb, const void *beta, void *c, const FINT *ldc);
 void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
                 const int K, const void *alpha, const void *A,
@@ -201,26 +315,49 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
 {
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 //swap a-b, m-n
+#ifdef FORINT
+const FINT mtmp=M;
+const FINT ntmp=N;
+const FINT ktmp=K;
+const FINT ldatmp=lda;
+const FINT ldbtmp=ldb;
+const FINT ldctmp=ldc;
+FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
+		 TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
+         &ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
+#else
 FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
 		 TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
         &N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
+#endif
 }


-extern "C" void FORNAME(dgemv)  (const char *TRANS, const int *M, const int *N, const double *ALPHA, const double *A,  const int *LDA,  const double *X,  const int *INCX, const double *BETA, double *Y, const int *INCY);
+extern "C" void FORNAME(dgemv)  (const char *TRANS, const FINT *M, const FINT *N, const double *ALPHA, const double *A,  const FINT *LDA,  const double *X,  const FINT *INCX, const double *BETA, double *Y, const FINT *INCY);
 void cblas_dgemv(const enum CBLAS_ORDER Order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const double alpha, const double *A, const int lda,
                 const double *X, const int incX, const double beta,
                 double *Y, const int incY)
 {
+#ifdef FORINT
+const FINT mtmp=M;
+const FINT ntmp=N;
+const FINT ldatmp=lda;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
+//swap n-m and toggle transposition
+else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
+#else
 if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
 //swap n-m and toggle transposition
 else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
+#endif
 }


-extern "C" void FORNAME(zgemv)  (const char *TRANS, const int *M, const int *N, const void *ALPHA, const void *A,  const int *LDA,  const void *X,  const int *INCX, const void *BETA, void *Y, const int *INCY);
+extern "C" void FORNAME(zgemv)  (const char *TRANS, const FINT *M, const FINT *N, const void *ALPHA, const void *A,  const FINT *LDA,  const void *X,  const FINT *INCX, const void *BETA, void *Y, const FINT *INCY);
 void cblas_zgemv(const enum CBLAS_ORDER Order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const void *alpha, const void *A, const int lda,
@@ -230,14 +367,29 @@ void cblas_zgemv(const enum CBLAS_ORDER Order,
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 if(TransA == CblasConjTrans) LA::laerror("zgemv with CblasConjTrans not supportted");
 //swap n-m and toggle transposition
+#ifdef FORINT
+const FINT mtmp=M;
+const FINT ntmp=N;
+const FINT ldatmp=lda;
+const FINT incxtmp=incX;
+const FINT incytmp=incY;
+FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
+#else
 FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
+#endif
 }

-extern "C" int FORNAME(idamax) (const int *N, const double *DX, const int *INCX);
+extern "C" FINT FORNAME(idamax) (const FINT *N, const double *DX, const FINT *INCX);

 CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX)
 {
-return FORNAME(idamax)(&N,X,&incX);
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT incxtmp=incX;
+return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp);
+#else
+return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX);
+#endif
 }


@@ -246,21 +398,38 @@ return FORNAME(idamax)(&N,X,&incX);
 #ifdef  NONCLAPACK
 //clapack_dgesv
 //allocate auxiliary storage and transpose input and output quantities to fortran/C order
-extern "C" void FORNAME(dgesv) (const int *N, const int *NRHS, double *A, const int *LDA, int *IPIV, double *B, const int *LDB, int *INFO);
+extern "C" void FORNAME(dgesv) (const FINT *N, const FINT *NRHS, double *A, const FINT *LDA, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO);

 int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
                  double *A, const int lda, int *ipiv,
                  double *B, const int ldb)
 {
-int INFO=0;
+FINT INFO=0;
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 //B should be in the same physical order, just transpose A in place and the LU result on output
 for(int i=1; i<N; ++i) for(int j=0; j<i; ++j)  {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
+#ifdef FORINT
+const FINT ntmp=N;
+const FINT nrhstmp=NRHS;
+const FINT ldatmp=lda;
+const FINT ldbtmp=ldb;
+FINT ipivtmp=*ipiv;
+FORNAME(dgesv) (&ntmp,&nrhstmp,A,&ldatmp,&ipivtmp,B,&ldbtmp,&INFO);
+#else
 FORNAME(dgesv) (&N,&NRHS,A,&lda,ipiv,B,&ldb,&INFO);
+#endif
 for(int i=1; i<N; ++i) for(int j=0; j<i; ++j)  {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
-return INFO;
+return (int)INFO;
 }

 #endif

+void cblas_dswap(const int N, double *X, const int incX,
+                 double *Y, const int incY){
+  LA::laerror("cblas_dswap is not available... (-DNONCBLAS)");
+}

+void cblas_zswap(const int N, void *X, const int incX,
+                 void *Y, const int incY){
+  LA::laerror("cblas_zswap is not available... (-DNONCBLAS)");
+}		 
--- a/nonclass.cc
+++ b/nonclass.cc
@@ -23,14 +23,11 @@
 #include "mat.h"
 #include "nonclass.h"
 #include "qsort.h"
+#include "fortran.h"
+

 namespace LA {

-#ifdef FORTRAN_
-#define FORNAME(x) x##_
-#else
-#define FORNAME(x) x
-#endif

 #define INSTANTIZE(T) \
 template void lawritemat(FILE *file,const T *a,int r,int c,const char *form0, \
@@ -191,16 +188,23 @@ linear_solve_do(A,&B[0],1,A.nrows(),det,n);
 // Next routines are not available in clapack, fotran ones will be used with an
 // additional swap/transpose of outputs when needed

-extern "C" void FORNAME(dspsv)(const char *UPLO, const int *N, const int *NRHS,
-		double *AP, int *IPIV, double *B, const int *LDB, int *INFO);
+extern "C" void FORNAME(dspsv)(const char *UPLO, const FINT *N, const FINT *NRHS,
+		double *AP, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO);

 static void linear_solve_do(NRSMat<double> &a, double *b, const int nrhs, const int ldb, double *det, int n)
 {
-	int r, *ipiv;
+	FINT r, *ipiv;
 	a.copyonwrite();
-	ipiv = new int[n];
+	ipiv = new FINT[n];
 	char U = 'U';
+#ifdef FORINT
+        const FINT ntmp=n;
+        const FINT nrhstmp=nrhs;
+	const FINT ldbtmp=ldb;
+	FORNAME(dspsv)(&U, &ntmp, &nrhstmp, a, ipiv, b, &ldbtmp,&r);
+#else	
 	FORNAME(dspsv)(&U, &n, &nrhs, a, ipiv, b, &ldb,&r);
+#endif	
 	if (r < 0) {
 		delete[] ipiv;
 		laerror("illegal argument in spsv() call of linear_solve()");
@@ -239,8 +243,8 @@ linear_solve_do(a,&B[0],1,a.nrows(),det,n);
 //other version of linear solver based on gesvx

 //------------------------------------------------------------------------------
-extern "C" void FORNAME(zgesvx)(const char *fact, const char *trans, const int *n, const int *nrhs, complex<double> *A,  const int *lda, complex<double> *AF, const int *ldaf, const int *ipiv, char *equed, double *R,double *C, complex<double> *B, const int *ldb, complex<double> *X, const int *ldx, double *rcond, double *ferr, double *berr, complex<double> *work, double *rwork, int *info);
-extern "C" void FORNAME(dgesvx)(const char *fact, const char *trans, const int *n, const int *nrhs, double *A,  const int *lda, double *AF, const int *ldaf, const int *ipiv, char *equed, double *R,double *C, double *B, const int *ldb, double *X, const int *ldx, double *rcond, double *ferr, double *berr, double *work, int *iwork, int *info);
+extern "C" void FORNAME(zgesvx)(const char *fact, const char *trans, const FINT *n, const FINT *nrhs, complex<double> *A,  const FINT *lda, complex<double> *AF, const FINT *ldaf, const FINT *ipiv, char *equed, double *R,double *C, complex<double> *B, const FINT *ldb, complex<double> *X, const FINT *ldx, double *rcond, double *ferr, double *berr, complex<double> *work, double *rwork, FINT *info);
+extern "C" void FORNAME(dgesvx)(const char *fact, const char *trans, const FINT *n, const FINT *nrhs, double *A,  const FINT *lda, double *AF, const FINT *ldaf, const FINT *ipiv, char *equed, double *R,double *C, double *B, const FINT *ldb, double *X, const FINT *ldx, double *rcond, double *ferr, double *berr, double *work, FINT *iwork, FINT *info);
 //------------------------------------------------------------------------------
 // solves set of linear equations using dgesvx
 // input:
@@ -270,18 +274,18 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
 	double *A;
 	double * const _A_data = (double*)_A;

-	int info;
-	const int nrhs	= _rhsCount;
-	const int n	= _eqCount;
-	int lda		= A_cols;
-	const int ldaf	= lda;
+	FINT info;
+	const FINT nrhs	= _rhsCount;
+	const FINT n	= _eqCount;
+	FINT lda	= A_cols;
+	const FINT ldaf	= lda;

 	double rcond;
 	double ferr[nrhs], berr[nrhs], work[4*n];
 	double R[n], C[n];

-	int *const iwork = new int[n];
-	int *const ipiv  = new int[n];
+	FINT *const iwork = new FINT[n];
+	FINT *const ipiv  = new FINT[n];

 	double *X  = new double[n*nrhs];
 	double *AF = new double[ldaf*n];
@@ -312,7 +316,7 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
 	if(_saveA){
 		delete[] A;
 	}
-	return info;
+	return (int)info;
 }
 //------------------------------------------------------------------------------
 // solves set of linear equations using zgesvx
@@ -343,18 +347,18 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
 	complex<double> *A;
 	complex<double> * const _A_data = (complex<double>*)_A;

-	int info;
-	const int nrhs	= _rhsCount;
-	const int n	= _eqCount;
-	int lda		= A_cols;
-	const int ldaf	= lda;
+	FINT info;
+	const FINT nrhs	= _rhsCount;
+	const FINT n	= _eqCount;
+	FINT lda	= A_cols;
+	const FINT ldaf	= lda;

 	double rcond;
 	double ferr[nrhs], berr[nrhs];
 	double R[n], C[n], rwork[2*n];
 	complex<double> work[2*n];

-	int *const ipiv = new int[n];
+	FINT *const ipiv = new FINT[n];

 	complex<double> *X  = new complex<double>[n*nrhs];
 	complex<double> *AF = new complex<double>[ldaf*n];
@@ -386,7 +390,7 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
 	if(_saveA){
 		delete[] A;
 	}
-	return info;
+	return (int)info;
 }
 //------------------------------------------------------------------------------
 // for given square matrices A, B computes X = AB^{-1} as follows
@@ -402,8 +406,8 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
 //------------------------------------------------------------------------------
 int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, double *_rcond){
 	
-	const int n = _A.nrows();
-	const int m = _A.ncols();
+	const FINT n = _A.nrows();
+	const FINT m = _A.ncols();
 	if(n != m || n != _B.nrows() || n != _B.ncols()){
 		laerror("multiply_by_inverse: incompatible matrices");
 	}
@@ -417,13 +421,13 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
 	double * const B = (double*)_B;
 	_B.copyonwrite();//even if fact='N', call copyonwrite because the solution is going to be stored in _B

-	int info;
+	FINT info;
 	double rcond;
 	double ferr[n], berr[n], work[4*n];
 	double R[n], C[n];

-	int *const iwork = new int[n];
-	int *const ipiv  = new int[n];
+	FINT *const iwork = new FINT[n];
+	FINT *const ipiv  = new FINT[n];

 	double *X  = new double[n2];
 	double *AF = new double[n2];
@@ -437,7 +441,7 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
 	delete[] iwork;delete[] ipiv;
 	delete[] AF;delete[] X;
 	
-	return info;
+	return (int)info;
 }
 //------------------------------------------------------------------------------
 // for given square matrices A, B computes X = AB^{-1} as follows
@@ -453,8 +457,8 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
 //------------------------------------------------------------------------------
 int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B, bool _useEq, double *_rcond){
 	
-	const int n = _A.nrows();
-	const int m = _A.ncols();
+	const FINT n = _A.nrows();
+	const FINT m = _A.ncols();
 	if(n != m || n != _B.nrows() || n != _B.ncols()){
 		laerror("multiply_by_inverse: incompatible matrices");
 	}
@@ -468,13 +472,13 @@ int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B
 	complex<double> * const B = (complex<double>*)_B;
 	_B.copyonwrite();//even if fact='N', call copyonwrite because the solution is going to be stored in _B

-	int info;
+	FINT info;
 	double rcond;
 	double ferr[n], berr[n];
 	double R[n], C[n], rwork[2*n];
 	complex<double> work[2*n];

-	int *const ipiv = new int[n];
+	FINT *const ipiv = new FINT[n];

 	complex<double> *X  = new complex<double>[n2];
 	complex<double> *AF = new complex<double>[n2];
@@ -488,24 +492,24 @@ int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B
 	delete[] ipiv;
 	delete[] AF;delete[] X;
 	
-	return info;
+	return (int)info;
 }
 //------------------------------------------------------------------------------



-extern "C" void FORNAME(dsyev)(const char *JOBZ, const char *UPLO, const int *N,
-		double *A, const int *LDA, double *W, double *WORK, const int *LWORK, int *INFO);
+extern "C" void FORNAME(dsyev)(const char *JOBZ, const char *UPLO, const FINT *N,
+		double *A, const FINT *LDA, double *W, double *WORK, const FINT *LWORK, FINT *INFO);

-extern "C" void FORNAME(dsygv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
-                double *A, const int *LDA, double *B, const int *LDB, double *W, double *WORK, const int *LWORK, int *INFO);
+extern "C" void FORNAME(dsygv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
+                double *A, const FINT *LDA, double *B, const FINT *LDB, double *W, double *WORK, const FINT *LWORK, FINT *INFO);


 // a will contain eigenvectors (columns if corder==1), w eigenvalues
 void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec, 
 		const bool corder, int n, NRMat<double> *b, const int itype)
 {
-	int m = a.nrows();
+	FINT m = a.nrows();
 	if (m != a.ncols()) laerror("diagonalize() call with non-square matrix");
 	if (a.nrows() != w.size()) 
 		laerror("inconsistent dimension of eigenvalue vector in diagonalize()");
@@ -517,22 +521,38 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
 	w.copyonwrite();
 	if(b) b->copyonwrite();

-	int r = 0;
+	FINT r = 0;
 	char U ='U';
 	char vectors = 'V';
 	if (!eivec) vectors = 'N';
-	int LWORK = -1;
+	FINT LWORK = -1;
 	double WORKX;
-	int ldb=0; if(b) ldb=b->ncols();
+	FINT ldb=0; if(b) ldb=b->ncols();

+#ifdef FORINT
+        const FINT itypetmp = itype;
+	FINT ntmp = n;
+	// First call is to determine size of workspace
+	if(b) FORNAME(dsygv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
+	else FORNAME(dsyev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, &r );
+#else
 	// First call is to determine size of workspace
 	if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
 	else FORNAME(dsyev)(&vectors, &U, &n, a, &m, w, &WORKX, &LWORK, &r );
-	LWORK = (int)WORKX;
+#endif
+
+        LWORK = (FINT)WORKX;
 	double *WORK = new double[LWORK];
-	if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, &r );
+
+#ifdef FORINT
+	if(b) FORNAME(dsygv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
+        else FORNAME(dsyev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, &r );
+#else
+        if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, &r );
 	else FORNAME(dsyev)(&vectors, &U, &n, a, &m, w, WORK, &LWORK, &r );
-	delete[] WORK;
+#endif
+
+        delete[] WORK;
 	if (vectors == 'V' && corder) a.transposeme(n);

 	if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()");
@@ -541,18 +561,18 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,



-extern "C" void FORNAME(zheev)(const char *JOBZ, const char *UPLO, const int *N,
-		complex<double> *A, const int *LDA, double *W, complex<double> *WORK, const int *LWORK, double *RWORK, int *INFO);
+extern "C" void FORNAME(zheev)(const char *JOBZ, const char *UPLO, const FINT *N,
+		complex<double> *A, const FINT *LDA, double *W, complex<double> *WORK, const FINT *LWORK, double *RWORK, FINT *INFO);

-extern "C" void FORNAME(zhegv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
-                complex<double> *A, const int *LDA, complex<double> *B, const int *LDB, double *W, complex<double> *WORK, const int *LWORK, double *RWORK, int *INFO);
+extern "C" void FORNAME(zhegv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
+                complex<double> *A, const FINT *LDA, complex<double> *B, const FINT *LDB, double *W, complex<double> *WORK, const FINT *LWORK, double *RWORK, FINT *INFO);


 // a will contain eigenvectors (columns if corder==1), w eigenvalues
 void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec, 
 		const bool corder, int n, NRMat<complex<double> > *b, const int itype)
 {
-	int m = a.nrows();
+	FINT m = a.nrows();
 	if (m != a.ncols()) laerror("diagonalize() call with non-square matrix");
 	if (a.nrows() != w.size()) 
 		laerror("inconsistent dimension of eigenvalue vector in diagonalize()");
@@ -564,23 +584,38 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
 	w.copyonwrite();
 	if(b) b->copyonwrite();

-	int r = 0;
+	FINT r = 0;
 	char U ='U';
 	char vectors = 'V';
 	if (!eivec) vectors = 'N';
-	int LWORK = -1;
+	FINT LWORK = -1;
 	complex<double> WORKX;
-	int ldb=0; if(b) ldb=b->ncols();
+	FINT ldb=0; if(b) ldb=b->ncols();

 	// First call is to determine size of workspace
 	double *RWORK = new double[3*n+2]; 
-	if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
+#ifdef FORINT
+        const FINT itypetmp = itype;
+	FINT ntmp = n;
+        if(b) FORNAME(zhegv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
+	else FORNAME(zheev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, RWORK, &r );
+#else
+        if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
 	else FORNAME(zheev)(&vectors, &U, &n, a, &m, w, &WORKX, &LWORK, RWORK, &r );
-	LWORK = (int)WORKX.real();
+#endif	
+
+        LWORK = (FINT)WORKX.real();
 	complex<double> *WORK = new complex<double>[LWORK];
-	if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, RWORK, &r );
+
+#ifdef FORINT
+        if(b) FORNAME(zhegv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
+        else FORNAME(zheev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, RWORK, &r );
+#else	
+        if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, RWORK, &r );
 	else FORNAME(zheev)(&vectors, &U, &n, a, &m, w, WORK, &LWORK, RWORK, &r );
-	delete[] WORK;
+#endif	
+
+        delete[] WORK;
 	delete[] RWORK;
 	if (vectors == 'V' && corder) a.transposeme(n);

@@ -590,11 +625,11 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,



-extern "C" void FORNAME(dspev)(const char *JOBZ, const char *UPLO, const int *N,
-		double *AP, double *W, double *Z, const int *LDZ, double *WORK, int *INFO);
+extern "C" void FORNAME(dspev)(const char *JOBZ, const char *UPLO, const FINT *N,
+		double *AP, double *W, double *Z, const FINT *LDZ, double *WORK, FINT *INFO);

-extern "C" void FORNAME(dspgv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
-                double *AP, double *BP, double *W, double *Z, const int *LDZ, double *WORK, int *INFO);
+extern "C" void FORNAME(dspgv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
+                double *AP, double *BP, double *W, double *Z, const FINT *LDZ, double *WORK, FINT *INFO);


 // v will contain eigenvectors, w eigenvalues
@@ -613,14 +648,21 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
 	if(v) v->copyonwrite();
 	if(b) b->copyonwrite();

-	int r = 0;
+	FINT r = 0;
 	char U = 'U';
 	char job = v ? 'v' : 'n';

 	double *WORK = new double[3*n];
-	int ldv=v?v->ncols():n;
-	if(b) FORNAME(dspgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK,  &r );
+	FINT ldv=v?v->ncols():n;
+#ifdef FORINT
+        const FINT itypetmp = itype;
+	FINT ntmp = n;
+        if(b) FORNAME(dspgv)(&itypetmp,&job, &U, &ntmp, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK,  &r );
+	else FORNAME(dspev)(&job, &U, &ntmp, a, w, v?(*v)[0]:(double *)0, &ldv, WORK,  &r );
+#else
+        if(b) FORNAME(dspgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK,  &r );
 	else FORNAME(dspev)(&job, &U, &n, a, w, v?(*v)[0]:(double *)0, &ldv, WORK,  &r );
+#endif
 	delete[] WORK;
 	if (v && corder) v->transposeme(n);

@@ -629,11 +671,11 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
 }


-extern "C" void FORNAME(zhpev)(const char *JOBZ, const char *UPLO, const int *N,
-		complex<double> *AP, double *W, complex<double> *Z, const int *LDZ, complex<double> *WORK, double *RWORK, int *INFO);
+extern "C" void FORNAME(zhpev)(const char *JOBZ, const char *UPLO, const FINT *N,
+		complex<double> *AP, double *W, complex<double> *Z, const FINT *LDZ, complex<double> *WORK, double *RWORK, FINT *INFO);

-extern "C" void FORNAME(zhpgv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
-                complex<double> *AP, complex<double> *BP, double *W, complex<double> *Z, const int *LDZ, complex<double> *WORK,  double *RWORK, int *INFO);
+extern "C" void FORNAME(zhpgv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
+                complex<double> *AP, complex<double> *BP, double *W, complex<double> *Z, const FINT *LDZ, complex<double> *WORK,  double *RWORK, FINT *INFO);


 // v will contain eigenvectors, w eigenvalues
@@ -652,15 +694,22 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
 	if(v) v->copyonwrite();
 	if(b) b->copyonwrite();

-	int r = 0;
+	FINT r = 0;
 	char U = 'U';
 	char job = v ? 'v' : 'n';

 	complex<double> *WORK = new complex<double>[2*n];
 	double *RWORK  = new double[3*n];
-	int ldv=v?v->ncols():n;
+	FINT ldv=v?v->ncols():n;
+#ifdef FORINT	
+        const FINT itypetmp = itype;
+        FINT ntmp = n;
+	if(b) FORNAME(zhpgv)(&itypetmp,&job, &U, &ntmp, a, *b, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK,  &r );
+	else FORNAME(zhpev)(&job, &U, &ntmp, a, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK,  &r );
+#else
 	if(b) FORNAME(zhpgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK,  &r );
 	else FORNAME(zhpev)(&job, &U, &n, a, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK,  &r );
+#endif
 	delete[] WORK;
 	delete[] RWORK;
 	if (v && corder) v->transposeme(n);
@@ -671,17 +720,17 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do



-extern "C" void FORNAME(dgesvd)(const char *JOBU,  const char *JOBVT,  const int *M,
-		const int *N,  double *A, const int *LDA, double *S, double *U, const int *LDU,
-		double *VT, const int *LDVT, double *WORK, const int *LWORK, int *INFO );
+extern "C" void FORNAME(dgesvd)(const char *JOBU,  const char *JOBVT,  const FINT *M,
+		const FINT *N,  double *A, const FINT *LDA, double *S, double *U, const FINT *LDU,
+		double *VT, const FINT *LDVT, double *WORK, const FINT *LWORK, FINT *INFO );

 void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s,
 		NRMat<double> *v, const bool corder, int m, int n)
 {
-	int m0 = a.nrows();
-	int n0 = a.ncols();
-	if(m<=0) m=m0;
-	if(n<=0) n=n0;
+	FINT m0 = a.nrows();
+	FINT n0 = a.ncols();
+	if(m<=0) m=(int)m0;
+	if(n<=0) n=(int)n0;
 	if(n>n0 || m>m0) laerror("bad dimension in singular_decomposition");
 	if (u) if (m > u->nrows() || m> u->ncols())
 		laerror("inconsistent dimension of U Mat in singular_decomposition()");
@@ -700,14 +749,30 @@ void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s
 	char jobu = u ? 'A' : 'N';
 	char jobv = v ? 'A' : 'N';
 	double work0;
-	int lwork = -1;
-	int r;
+	FINT lwork = -1;
+	FINT r;
+
+#ifdef FORINT
+        FINT ntmp = n;
+	FINT mtmp = m;
+	FORNAME(dgesvd)(&jobv, &jobu, &ntmp, &mtmp, a, &n0, s, v?(*v)[0]:0, &n0,
+			u?(*u)[0]:0, &m0, &work0, &lwork, &r);
+#else
 	FORNAME(dgesvd)(&jobv, &jobu, &n, &m, a, &n0, s, v?(*v)[0]:0, &n0,
 			u?(*u)[0]:0, &m0, &work0, &lwork, &r);
-	lwork = (int) work0;
+#endif
+
+	lwork = (FINT) work0;
 	double *work = new double[lwork];
+
+#ifdef FORINT
+	FORNAME(dgesvd)(&jobv, &jobu, &ntmp, &mtmp, a, &n0, s, v?(*v)[0]:0, &n0,
+			u?(*u)[0]:0, &m0, work, &lwork, &r);
+#else
 	FORNAME(dgesvd)(&jobv, &jobu, &n, &m, a, &n0, s, v?(*v)[0]:0, &n0,
 			u?(*u)[0]:0, &m0, work, &lwork, &r);
+#endif
+
 	delete[] work;
 	if (v && corder) v->transposeme(n);

@@ -719,14 +784,14 @@ void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s
 //extern "C" void FORNAME(dgeqrf)(const int *M, const int *N, double *A, const int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO);


-extern "C" void FORNAME(dgeev)(const char *JOBVL, const char *JOBVR, const int *N,
-		double *A, const int *LDA, double *WR, double *WI, double *VL, const int *LDVL,
-		double *VR, const int *LDVR, double *WORK, const int *LWORK, int *INFO );
+extern "C" void FORNAME(dgeev)(const char *JOBVL, const char *JOBVR, const FINT *N,
+		double *A, const FINT *LDA, double *WR, double *WI, double *VL, const FINT *LDVL,
+		double *VR, const FINT *LDVR, double *WORK, const FINT *LWORK, FINT *INFO );

-extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const int *N,
-		double *A, const int *LDA, double *B, const int *LDB, double *WR, double *WI,  double *WBETA, 
-		 double *VL, const int *LDVL,  double *VR, const int *LDVR,  
-		double *WORK, const int *LWORK, int *INFO );
+extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const FINT *N,
+		double *A, const FINT *LDA, double *B, const FINT *LDB, double *WR, double *WI,  double *WBETA, 
+		 double *VL, const FINT *LDVL,  double *VR, const FINT *LDVR,  
+		double *WORK, const FINT *LWORK, FINT *INFO );


 //statics for sorting
@@ -802,23 +867,41 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 	char jobvl = vl ? 'V' : 'N';
 	char jobvr = vr ? 'V' : 'N';
 	double work0;
-	int lwork = -1;
-	int r;
-	int lda=a.ncols();
-	int ldb=0;
+	FINT lwork = -1;
+	FINT r;
+	FINT lda=a.ncols();
+	FINT ldb=0;
 	if(b) ldb=b->ncols();
-	int ldvl= vl?vl->ncols():lda;
-	int ldvr= vr?vr->ncols():lda;
-	if(b)  FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
+	FINT ldvl= vl?vl->ncols():lda;
+	FINT ldvr= vr?vr->ncols():lda;
+
+#ifdef FORINT
+        FINT ntmp = n; 
+        if(b)  FORNAME(dggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
+                        &ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
+	else FORNAME(dgeev)(&jobvr, &jobvl, &ntmp, a, &lda, wr, wi, vr?vr[0]:(double *)0,
+			&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
+#else
+        if(b)  FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
                        &ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
 	else FORNAME(dgeev)(&jobvr, &jobvl, &n, a, &lda, wr, wi, vr?vr[0]:(double *)0,
 			&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
-	lwork = (int) work0;
+#endif
+
+        lwork = (FINT) work0;
 	double *work = new double[lwork];
+
+#ifdef FORINT	
+	if(b) FORNAME(dggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
+                        &ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
+	else FORNAME(dgeev)(&jobvr, &jobvl, &ntmp, a, &lda, wr, wi, vr?vr[0]:(double *)0,
+			&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
+#else
 	if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
                        &ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
 	else FORNAME(dgeev)(&jobvr, &jobvl, &n, a, &lda, wr, wi, vr?vr[0]:(double *)0,
 			&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
+#endif
 	delete[] work;

 //std::cout <<"TEST dgeev\n"<<wr<<wi<<*vr<<*vl<<std::endl;
@@ -1208,16 +1291,16 @@ return r;


 //Cholesky interface
-extern "C" void FORNAME(dpotrf)(const char *UPLO, const int *N, double *A, const int *LDA, int *INFO);
-extern "C" void FORNAME(zpotrf)(const char *UPLO, const int *N, complex<double> *A, const int *LDA, int *INFO);
+extern "C" void FORNAME(dpotrf)(const char *UPLO, const FINT *N, double *A, const FINT *LDA, FINT *INFO);
+extern "C" void FORNAME(zpotrf)(const char *UPLO, const FINT *N, complex<double> *A, const FINT *LDA, FINT *INFO);

 void cholesky(NRMat<double> &a, bool upper)
 {
 if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
-int lda=a.ncols();
-int n=a.nrows();
+FINT lda=a.ncols();
+FINT n=a.nrows();
 char uplo=upper?'U':'L';
-int info;
+FINT info;
 a.copyonwrite();
 FORNAME(dpotrf)(&uplo, &n, a, &lda, &info);
 if(info) {std::cerr << "Lapack error "<<info<<std::endl; laerror("error in Cholesky");}
@@ -1233,10 +1316,10 @@ else
 void cholesky(NRMat<complex<double> > &a, bool upper)
 {
 if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
-int lda=a.ncols();
-int n=a.nrows();
+FINT lda=a.ncols();
+FINT n=a.nrows();
 char uplo=upper?'U':'L';
-int info;
+FINT info;
 a.copyonwrite();
 a.transposeme();//switch to Fortran order
 FORNAME(zpotrf)(&uplo, &n, a, &lda, &info);
@@ -1250,14 +1333,14 @@ else


 //various norms
-extern "C" double FORNAME(zlange)( const char *NORM, const int *M, const int *N, complex<double> *A, const int *LDA, double *WORK); 
-extern "C" double FORNAME(dlange)( const char *NORM, const int *M, const int *N, double *A, const int *LDA, double *WORK); 
+extern "C" double FORNAME(zlange)( const char *NORM, const FINT *M, const FINT *N, complex<double> *A, const FINT *LDA, double *WORK); 
+extern "C" double FORNAME(dlange)( const char *NORM, const FINT *M, const FINT *N, double *A, const FINT *LDA, double *WORK); 

 double MatrixNorm(NRMat<complex<double> > &A, const char norm)
 {
        const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
-        const int M = A.nrows();
-        const int N = A.ncols();
+        const FINT M = A.nrows();
+        const FINT N = A.ncols();
        double work[M];
        const double ret = FORNAME(zlange)(&TypNorm, &M, &N, A[0], &M, &work[0]);
        return ret;
@@ -1266,8 +1349,8 @@ double MatrixNorm(NRMat<complex<double> > &A, const char norm)
 double MatrixNorm(NRMat<double > &A, const char norm)
 {
        const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
-        const int M = A.nrows();
-        const int N = A.ncols();
+        const FINT M = A.nrows();
+        const FINT N = A.ncols();
        double work[M];
        const double ret = FORNAME(dlange)(&TypNorm, &M, &N, A[0], &M, &work[0]);
        return ret;
@@ -1276,15 +1359,15 @@ double MatrixNorm(NRMat<double > &A, const char norm)


 //condition number
-extern "C" void FORNAME(zgecon)( const char *norm, const int *n, complex<double> *A, const int *LDA, const double *anorm, double *rcond, complex<double> *work, double *rwork, int *info);
-extern "C" void FORNAME(dgecon)( const char *norm, const int *n, double *A, const int *LDA, const double *anorm, double *rcond, double *work, double *rwork, int *info);
+extern "C" void FORNAME(zgecon)( const char *norm, const FINT *n, complex<double> *A, const FINT *LDA, const double *anorm, double *rcond, complex<double> *work, double *rwork, FINT *info);
+extern "C" void FORNAME(dgecon)( const char *norm, const FINT *n, double *A, const FINT *LDA, const double *anorm, double *rcond, double *work, double *rwork, FINT *info);

 double CondNumber(NRMat<complex<double> > &A, const char norm)
 {
        const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
-        const int N = A.nrows();
+        const FINT N = A.nrows();
        double Norma(0.0), ret(0.0);
-        int info;
+        FINT info;
        complex<double> *work;
        double *rwork;

@@ -1305,9 +1388,9 @@ double CondNumber(NRMat<complex<double> > &A, const char norm)
 double CondNumber(NRMat<double> &A, const char norm)
 {
        const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
-        const int N = A.nrows();
+        const FINT N = A.nrows();
        double Norma(0.0), ret(0.0);
-        int info;
+        FINT info;
        double *work;
        double *rwork;

--- a/smat.cc
+++ b/smat.cc
@@ -44,6 +44,16 @@ namespace LA {
 template <typename T>
 void NRSMat<T>::put(int fd, bool dim, bool transp) const
 {
+#ifdef CUDALA
+if(location!=cpu)
+        {
+        NRSMat<T> tmp= *this;
+        tmp.moveto(cpu);
+        tmp.put(fd,dim,transp);
+        return;
+        }
+#endif
+
 errno=0;
 if(dim)
 {
@@ -56,6 +66,18 @@ LA_traits<T>::multiput(NN2,fd,v,dim);
 template <typename T>
 void NRSMat<T>::get(int fd, bool dim, bool transp)
 {
+#ifdef CUDALA
+if(location!=cpu)
+        {
+        NRSMat<T> tmp;
+        tmp.moveto(cpu);
+        tmp.get(fd,dim,transp);
+        tmp.moveto(location);
+        *this = tmp;
+        return;
+        }
+#endif
+
 int nn0[2]; //align at least 8-byte
 errno=0;
 if(dim)
@@ -402,6 +424,9 @@ cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2);
 }


+//some template specializations leading to BLAS/CUBLAS calls
+
+


 //////////////////////////////////////////////////////////////////////////////
--- a/smat.h
+++ b/smat.h
@@ -29,12 +29,20 @@ protected:
 	int nn;
 	T *v;
 	int *count;
+#ifdef CUDALA
+       GPUID location;
+#endif
 public:
 	friend class NRVec<T>;
 	friend class NRMat<T>;
 	
-	inline NRSMat() : nn(0),v(0),count(0) {};
-	inline explicit NRSMat(const int n);			// Zero-based array
+	inline NRSMat() : nn(0),v(0),count(0) 
+                        {
+#ifdef CUDALA
+                        location = DEFAULT_LOC;
+#endif
+                        };
+	inline explicit NRSMat(const int n, const GPUID loc= undefined);// Zero-based array
 	inline NRSMat(const T &a, const int n);	//Initialize to constant
 	inline NRSMat(const T *a, const int n);	// Initialize to array
 	inline NRSMat(const NRSMat &rhs);		// Copy constructor
@@ -45,6 +53,13 @@ public:
 	NRSMat & operator=(const NRSMat &rhs);	//assignment
        void randomize(const typename LA_traits<T>::normtype &x);
 	NRSMat & operator=(const T &a);		//assign a to diagonal
+#ifdef CUDALA
+        inline GPUID getlocation() const {return location;}
+        void moveto(const GPUID dest);
+#else
+        inline GPUID getlocation() const {return cpu;}
+        void moveto(const GPUID dest) {};
+#endif
        const bool operator!=(const NRSMat &rhs) const {if(nn!=rhs.nn) return 1; return LA_traits<T>::gencmp(v,rhs.v,NN2);} //memcmp for scalars else elementwise
        const bool operator==(const NRSMat &rhs) const {return !(*this != rhs);};
 	inline NRSMat & operator*=(const T &a);
@@ -65,8 +80,8 @@ public:
 	const NRMat<T> operator*(const NRMat<T> &rhs) const; // SMat*Mat 
 	const T dot(const NRSMat &rhs) const; // Smat.Smat//@@@for complex do conjugate
 	const T dot(const NRVec<T> &rhs) const; //Smat(as vec).vec //@@@for complex do conjugate
-	const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
-	const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
+	const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
+	const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
 	const T* diagonalof(NRVec<T> &, const bool divide=0, bool cache=false) const; //get diagonal
 	void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const {r.gemv(beta,*this,trans,alpha,x);};
 	void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);};
@@ -108,29 +123,63 @@ namespace LA {

 // ctors
 template <typename T>
-inline NRSMat<T>::NRSMat(const int n) : nn(n), v(new T[NN2]),
-				count(new int) {*count = 1;}
-
-template <typename T>
-inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n),
-	        v(new T[NN2]), count(new int)
+inline NRSMat<T>::NRSMat(const int n, const GPUID loc) : nn(n), count(new int(1)) 
 {
-	*count =1;
-	if(a != (T)0) for(int i=0; i<NN2; i++) v[i] = a;
-	else memset(v, 0, NN2*sizeof(T));
+#ifdef CUDALA
+        location= (loc==undefined?DEFAULT_LOC:loc);
+        if(location==cpu)
+#endif
+	v=new T[NN2];
+#ifdef CUDALA
+	else v= (T*) gpualloc(NN2*sizeof(T));
+#endif
 }

 template <typename T>
-inline NRSMat<T>::NRSMat(const T *a, const int n) : nn(n),
-	        v(new T[NN2]), count(new int)
+inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n), count(new int(1))
 {
-	*count = 1;
-	memcpy(v, a, NN2*sizeof(T));
+#ifdef CUDALA
+        location=DEFAULT_LOC;
+	if(location==cpu)
+#endif
+        {
+	v=new T[NN2];
+	if(a != (T)0) for(int i=0; i<NN2; i++) v[i] = a;
+	else memset(v, 0, NN2*sizeof(T));
+	}
+#ifdef CUDALA
+	else
+	{
+	v= (T*) gpualloc(NN2*sizeof(T));
+	cublasSetVector(NN2,sizeof(T),&a,0,v,1);
+	}
+#endif
+}
+
+template <typename T>
+inline NRSMat<T>::NRSMat(const T *a, const int n) : nn(n), count(new int(1))
+{
+#ifdef CUDALA
+        location=DEFAULT_LOC;
+        if(location==cpu)
+#endif
+		memcpy(v, a, NN2*sizeof(T));
+#ifdef CUDALA
+        else
+        {
+        v= (T*) gpualloc(NN2*sizeof(T));
+        cublasSetVector(NN2,sizeof(T),a,1,v,1);
+        }
+#endif
+
 }

 template <typename T>
 inline NRSMat<T>::NRSMat(const NRSMat<T> &rhs) //copy constructor
 {
+#ifdef CUDALA
+        location=rhs.location;
+#endif
 	v = rhs.v;
 	nn = rhs.nn;
 	count = rhs.count;
@@ -140,6 +189,9 @@ inline NRSMat<T>::NRSMat(const NRSMat<T> &rhs) //copy constructor
 template <typename T>
 NRSMat<T>::NRSMat(const NRVec<T> &rhs, const int n) // type conversion
 {
+#ifdef CUDALA
+        location=rhs.location;
+#endif
 	nn = n;
 #ifdef DEBUG
 	if (NN2 != rhs.size())
@@ -150,6 +202,7 @@ NRSMat<T>::NRSMat(const NRVec<T> &rhs, const int n) // type conversion
 	(*count)++;
 }

+
 // S *= a
 template<>
 inline NRSMat<double> & NRSMat<double>::operator*=(const double & a)
@@ -437,33 +490,31 @@ NRSMat<T>::~NRSMat()
 {
        if (!count) return;
        if (--(*count) <= 0) {
-                if (v) delete[] (v);
+                if (v) 
+		    {
+#ifdef CUDALA
+                    if(location==cpu)
+#endif
+			delete[] v;
+#ifdef CUDALA
+                        else gpufree(v);
+#endif
+                    }
                delete count;
        }
 }

+
 // assignment with a physical copy
 template <typename T>
 NRSMat<T> & NRSMat<T>::operator|=(const NRSMat<T> &rhs)
 {
-        if (this != &rhs) {
-                if(!rhs.v) laerror("unallocated rhs in NRSMat operator |=");
-                if(count)
-                        if(*count > 1) {        // detach from the other
-                                --(*count);
-                                nn = 0;
-                                count = 0;
-                                v = 0;
-                        }
-                if (nn != rhs.nn) {
-                        if(v) delete [] (v);
-                        nn = rhs.nn;
-                }
-                if (!v) v = new T[NN2];
-                if (!count) count = new int;
-                *count = 1;
-                memcpy(v, rhs.v, NN2*sizeof(T));
-        }
+#ifdef DEBUG
+                if (!rhs.v) laerror("unallocated rhs in NRSMat operator |=");
+#endif
+        if (this == &rhs) return *this;
+        *this = rhs;
+        this->copyonwrite();
        return *this;
 }

@@ -474,13 +525,24 @@ NRSMat<T> & NRSMat<T>::operator=(const NRSMat<T> & rhs)
 {
        if (this == & rhs) return *this;
        if (count)
-                if(--(*count) == 0) {
-                        delete [] v;
+                if(--(*count) == 0) 
+			{
+#ifdef CUDALA
+        		if(location==cpu)
+#endif
+                        	delete [] v;
+#ifdef CUDALA
+		        else
+               			 gpufree(v);
+#endif
                        delete count;
-                }
+                	}
        v = rhs.v;
        nn = rhs.nn;
        count = rhs.count;
+#ifdef CUDALA
+        location=rhs.location;
+#endif
        if (count) (*count)++;
        return *this;
 }
@@ -495,9 +557,24 @@ void NRSMat<T>::copyonwrite()
                (*count)--;
                count = new int;
                *count = 1;
-                T *newv = new T[NN2];
-                memcpy(newv, v, NN2*sizeof(T));
-                v = newv;
+		T *newv;
+#ifdef CUDALA
+	    if(location==cpu)
+       		 {
+#endif
+                 newv = new T[NN2];
+                 memcpy(newv, v, NN2*sizeof(T));
+#ifdef CUDALA
+	        }
+     	else
+        	{
+        	 newv = (T *) gpualloc(NN2*sizeof(T));
+         	if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
+         	cublasScopy(NN2*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
+        	}
+#endif
+
+                 v = newv;
        }
 }

@@ -514,7 +591,16 @@ void NRSMat<T>::resize(const int n)
 	    	if(n==0)
 	        {
 	        if(--(*count) <= 0) {
-	                if(v) delete[] (v);
+			if(v) {
+#ifdef CUDALA
+                        if(location==cpu)
+#endif
+		                 delete[] (v);
+#ifdef CUDALA
+                        else
+                                gpufree(v);
+#endif
+			}
 	                delete count;
 	                }
 	        count=0;
@@ -534,16 +620,71 @@ void NRSMat<T>::resize(const int n)
                count = new int;
                *count = 1;
                nn = n;
+#ifdef CUDALA
+     if(location==cpu)
+#endif
                v = new T[NN2];
+#ifdef CUDALA
+      else
+        v = (T*) gpualloc(NN2*sizeof(T));
+#endif
+
                return;
        }
        if (n != nn) {
-                nn = n;
-                delete[] v;
-                v = new T[NN2];
+              nn = n;
+#ifdef CUDALA
+	     if(location==cpu)
+#endif
+			{
+                	delete[] v;
+                	v = new T[NN2];
+			}
+#ifdef CUDALA
+	      else
+       			 {
+			 gpufree(v);
+			 v = (T*) gpualloc(NN2*sizeof(T));
+		         }
+#endif
+
        }
 }

+#ifdef CUDALA
+template<typename T>
+void NRSMat<T>::moveto(const GPUID dest)
+{
+if(location==dest) return;
+location=dest;
+
+if(v && !count) laerror("internal inconsistency of reference counting 1");
+if (!count) return;
+
+if(v && *count==0) laerror("internal inconsistency of reference counting 2");
+if(!v) return;
+
+T *vold = v;
+
+if(dest == cpu) //moving from GPU to CPU
+        {
+        v = new T[NN2];
+        gpuget(NN2,sizeof(T),vold,v);
+        if(*count == 1) gpufree(vold);
+        else {--(*count); count = new int(1);}
+        }
+else    //moving from CPU to GPU
+        {
+        v=(T *) gpualloc(NN2*sizeof(T));
+        gpuput(NN2,sizeof(T),vold,v);
+        if(*count == 1) delete[] vold;
+        else {--(*count); count = new int(1);}
+        }
+}
+#endif
+
+
+

 template<typename T>
 NRSMat<complex<T> > complexify(const NRSMat<T> &rhs)
@@ -554,10 +695,15 @@ for(int i=0; i<rhs.nrows(); ++i)
 return r;
 }

+
 // I/O
 template <typename T>
 std::ostream& operator<<(std::ostream &s, const NRSMat<T> &x)
                {
+#ifdef CUDALA
+if(x.getlocation()==cpu)
+   {
+#endif
                int i,j,n;
                n=x.nrows();
                s << n << ' ' << n << '\n';
@@ -566,12 +712,25 @@ std::ostream& operator<<(std::ostream &s, const NRSMat<T> &x)
                        for(j=0; j<n;j++) s << (typename LA_traits_io<T>::IOtype)x(i,j) << (j==n-1 ? '\n' : ' ');
                        }
                return s;
+#ifdef CUDALA
+   }
+else
+    {
+    NRSMat<T> tmp=x;
+    tmp.moveto(cpu);
+    return s<<tmp;
+    }
+#endif
                }


 template <typename T>
 std::istream& operator>>(std::istream  &s, NRSMat<T> &x)
                {
+#ifdef CUDALA
+if(x.getlocation()==cpu)
+  {
+#endif
                int i,j,n,m;
                s >> n >> m;
                if(n!=m) laerror("input symmetric matrix not square");
@@ -579,6 +738,18 @@ std::istream& operator>>(std::istream  &s, NRSMat<T> &x)
 		typename LA_traits_io<T>::IOtype tmp;
                for(i=0;i<n;i++) for(j=0; j<m;j++) {s>>tmp; x(i,j)=tmp;}
                return s;
+#ifdef CUDALA
+  }
+else
+                {
+                NRSMat<T> tmp;
+                tmp.moveto(cpu);
+                s >> tmp;
+                tmp.moveto(x.getlocation());
+                x=tmp;
+                return s;
+                }
+#endif
                }


--- a/t.cc
+++ b/t.cc
@@ -24,6 +24,7 @@
 using namespace std;
 using namespace LA;

+
 extern void test(const NRVec<double> &x);


@@ -1551,7 +1552,7 @@ cout <<"Error = "<<(cx.transpose(true)*cx -bb).norm()<<endl;
 }


-if(1)
+if(0)
 {
 int n;
 cin >>n;
@@ -1574,5 +1575,59 @@ cout <<"Cholesky factor filling = "<<b.simplify()<<endl;
 }


+if(0)
+{
+int n;
+cin >>n;
+NRMat<double> a(n,n),b(n,n);
+a.randomize(1.);
+b.randomize(1.);
+NRMat<double>c;
+double t=clock()/((double) (CLOCKS_PER_SEC));
+int rep=1+10000000000LL/n/n/n;
+cout <<"Repetitions " <<rep<<endl;
+for(int i=0; i<rep; ++i) c=a*b;
+cout <<"CPU time (ms) "<<(clock()/((double) (CLOCKS_PER_SEC))-t)*1000./rep <<"\n";
+NRMat<double>cgpu;
+a.moveto(gpu1);
+b.moveto(gpu1);
+ t=clock()/((double) (CLOCKS_PER_SEC));
+for(int i=0; i<rep; ++i) cgpu=a*b;
+cout <<"GPU time (ms) "<<(clock()/((double) (CLOCKS_PER_SEC))-t)*1000./rep <<"\n";
+cgpu.moveto(cpu);
+cout << "Error = "<<(c-cgpu).norm()<<endl;
+}
+
+if(1)
+{
+int n;
+cin >>n;
+NRMat<double> a(n,n);
+a.randomize(1.);
+NRVec<double> v(n);
+v.randomize(1.);
+NRSMat<double> as(n);
+as.randomize(1.);
+NRVec<double>w = a*v;
+NRVec<double>ws = as*v;
+NRMat<double>c(n,n);
+c=exp(a);
+a.moveto(gpu1);
+v.moveto(gpu1);
+as.moveto(gpu1);
+NRVec<double>wgpu = a*v;
+NRVec<double>wsgpu = as*v;
+w.moveto(gpu1);
+ws.moveto(gpu1);
+cout << "Error gemv = "<<(w-wgpu).norm()<<endl;
+cout << "Error symv = "<<(ws-wsgpu).norm()<<endl;
+NRMat<double>cgpu;
+cgpu=exp(a);
+c.moveto(gpu1);
+cout << "Error = "<<(c-cgpu).norm()<<endl;
+}
+
+
+

 }
--- a/vec.cc
+++ b/vec.cc
@@ -84,6 +84,16 @@ NRVec<T>::NRVec(const NRMat<T> &rhs)
 template <typename T>
 void NRVec<T>::put(int fd, bool dim, bool transp) const
 {
+#ifdef CUDALA
+if(location!=cpu)
+        {
+        NRVec<T> tmp= *this;
+        tmp.moveto(cpu);
+        tmp.put(fd,dim,transp);
+        return;
+        }
+#endif
+
 errno=0;
 int pad=1; //align at least 8-byte
 if(dim)
@@ -94,9 +104,21 @@ if(sizeof(int) != write(fd,&pad,sizeof(int))) laerror("cannot write");
 LA_traits<T>::multiput(nn,fd,v,dim);
 }

+
 template <typename T>
 void NRVec<T>::get(int fd, bool dim, bool transp)
 {
+#ifdef CUDALA
+if(location!=cpu)
+        {
+        NRVec<T> tmp;
+        tmp.moveto(cpu);
+        tmp.get(fd,dim,transp);
+        tmp.moveto(location);
+        *this = tmp;
+        return;
+        }
+#endif
 int nn0[2]; //align at least 8-byte
 errno=0;
 if(dim)
@@ -319,9 +341,16 @@ void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
 	if ((trans == 'n'?A.ncols():A.nrows()) != x.size())
 		laerror("incompatible sizes in gemv A*x");
 #endif
+	SAME_LOC3(*this,x,A);
 	copyonwrite();
-	cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans),
-			A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+#ifdef CUDALA
+	if(location==cpu)
+#endif
+		cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+#ifdef CUDALA
+        else
+		cublasDgemv((trans=='n' ?'T':'N'),A.ncols(), A.nrows(),alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+#endif
 }


@@ -362,8 +391,16 @@ void NRVec<double>::gemv(const double beta, const NRSMat<double> &A,
 #ifdef DEBUG
 	if (A.ncols()!=x.size()) laerror("incompatible dimension in gemv A*x");
 #endif
+SAME_LOC3(*this,A,x);
 	copyonwrite();
+#ifdef CUDALA
+if(location==cpu)
+#endif
 	cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, x.v, 1, beta, v, 1);
+#ifdef CUDALA
+else
+	cublasDspmv('U',A.ncols(), alpha, A, x.v, 1, beta, v, 1);
+#endif
 }

 template<>
@@ -420,6 +457,7 @@ NRVec<complex<double> >::otimes(const NRVec< complex<double> > &b, const bool co
 template<typename T>
 int NRVec<T>::sort(int direction, int from, int to, int *perm)
 {
+NOT_GPU(*this);
 copyonwrite();
 if(to == -1) to=nn-1;
 if(direction) return memqsort<1,NRVec<T>,int,int>(*this,perm,from,to);
@@ -427,6 +465,10 @@ else return memqsort<0,NRVec<T>,int,int>(*this,perm,from,to);
 }


+
+
+
+
 //////////////////////////////////////////////////////////////////////////////
 //// forced instantization in the corespoding object file

--- a/vec.h
+++ b/vec.h
@@ -30,6 +30,9 @@ template <typename T> void lawritemat(FILE *file,const T *a,int r,int c,

 // Memory allocated constants for cblas routines
 const static complex<double> CONE = 1.0, CMONE = -1.0, CZERO = 0.0;
+#ifdef CUDALA
+const static cuDoubleComplex CUONE = {1.,0.}, CUMONE = {-1.,0.}, CUZERO = {0.,0.};
+#endif

 // Macros to construct binary operators +,-,*, from +=, -=, *=
 // for 3 cases: X + a, a + X, X + Y
@@ -44,7 +47,7 @@ template<class T> \

 #define NRVECMAT_OPER2(E,X) \
 template<class T> \
-	inline const NR##E<T> NR##E<T>::operator X(const NR##E<T> &a) const \
+inline const NR##E<T> NR##E<T>::operator X(const NR##E<T> &a) const \
 { return NR##E(*this) X##= a; }


@@ -55,12 +58,32 @@ protected:
 	int nn;
 	T *v;
 	int *count;
+#ifdef CUDALA
+	GPUID location;
+#endif
 public:
 	friend class NRSMat<T>;
 	friend class NRMat<T>;

-	inline NRVec(): nn(0),v(0),count(0){};
-	explicit inline NRVec(const int n) : nn(n), v(new T[n]), count(new int(1)) {};
+	inline NRVec(): nn(0),v(0),count(0)
+                        {
+#ifdef CUDALA
+                        location = DEFAULT_LOC;
+#endif
+                        };
+	explicit inline NRVec(const int n, const GPUID loc= undefined) : nn(n), count(new int(1)) 
+                        {
+#ifdef CUDALA
+                        if(loc==undefined) location = DEFAULT_LOC; else location = loc;
+			if(location==cpu)
+#endif
+				v= new T[n];
+#ifdef CUDALA
+			else
+				v= (T*) gpualloc(n*sizeof(T));
+#endif
+                        };
+
 	inline NRVec(const T &a, const int n);
        inline NRVec(const T *a, const int n);
 	inline NRVec(T *a, const int n, bool skeleton);
@@ -71,6 +94,13 @@ public:
 	explicit NRVec(const NRMat<T> &rhs) : NRVec(&rhs[0][0],rhs.nrows()*rhs.ncols()) {};
 #else
 	explicit NRVec(const NRMat<T> &rhs);
+#endif
+#ifdef CUDALA
+        inline GPUID getlocation() const {return location;}
+        void moveto(const GPUID dest);
+#else
+        inline GPUID getlocation() const {return cpu;}
+        void moveto(const GPUID dest) {};
 #endif
 	NRVec & operator=(const NRVec &rhs);
 	NRVec & operator=(const T &a);  //assign a to every element
@@ -103,8 +133,8 @@ public:
 	void gemv(const T beta, const SparseMat<T> &a, const char trans, const T alpha, const NRVec &x,const bool treat_as_symmetric=false);
 	void gemv(const typename LA_traits_complex<T>::Component_type beta, const typename LA_traits_complex<T>::NRMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex<T>::Component_type alpha, const NRVec &x);
 	void gemv(const typename LA_traits_complex<T>::Component_type beta, const typename LA_traits_complex<T>::NRSMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex<T>::Component_type alpha, const NRVec &x);
-	const NRVec operator*(const NRMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
-	const NRVec operator*(const NRSMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
+	const NRVec operator*(const NRMat<T> &mat) const {NRVec<T> result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
+	const NRVec operator*(const NRSMat<T> &mat) const {NRVec<T> result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
 	const NRVec operator*(const SparseMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
 	const NRMat<T> otimes(const NRVec<T> &rhs, const bool conjugate=false, const T &scale=1) const; //outer product
 	inline const NRMat<T> operator|(const NRVec<T> &rhs) const {return otimes(rhs,true);};
@@ -150,29 +180,58 @@ public:
 #include "sparsemat.h"
 #include "sparsesmat.h"

+
+
 namespace LA {
 // formatted I/O
 template <typename T>
 std::ostream & operator<<(std::ostream &s, const NRVec<T> &x)
 {
+#ifdef CUDALA
+if(x.getlocation()==cpu)
+   {
+#endif
  int i, n;
-
  n = x.size();
  s << n << std::endl;
  for(i=0; i<n; i++) s << (typename LA_traits_io<T>::IOtype)x[i] << (i == n-1 ? '\n' : ' ');
  return s;
+#ifdef CUDALA
+   }
+else
+    {
+    NRVec<T> tmp=x;
+    tmp.moveto(cpu);
+    return s<<tmp;
+    }
+#endif
 }

 template <typename T>
 std::istream & operator>>(std::istream &s, NRVec<T> &x)
 {
+#ifdef CUDALA
+if(x.getlocation()==cpu)
+  {
+#endif
  int i,n;
-
  s >> n;
  x.resize(n);
  typename LA_traits_io<T>::IOtype tmp;
  for(i=0; i<n; i++) {s >> tmp; x[i]=tmp;}
  return s;
+#ifdef CUDALA
+  }
+else
+                {
+                NRVec<T> tmp;
+                tmp.moveto(cpu);
+                s >> tmp;
+                tmp.moveto(x.getlocation());
+                x=tmp;
+                return s;
+                }
+#endif
 }


@@ -180,22 +239,51 @@ std::istream & operator>>(std::istream &s, NRVec<T> &x)

 // ctors
 template <typename T>
-inline NRVec<T>::NRVec(const T& a, const int n) : nn(n), v(new T[n]), count(new int)
+inline NRVec<T>::NRVec(const T& a, const int n) : nn(n), count(new int)
 {
 	*count = 1;
+#ifdef CUDALA
+	location=DEFAULT_LOC;
+    if(location==cpu)
+	{
+#endif
+	v = new T[n];
 	if(a != (T)0)
 		for(int i=0; i<n; i++)
 			v[i] = a;
 	else
 		memset(v, 0, nn*sizeof(T));
+#ifdef CUDALA
+	}
+    else
+	{
+	v= (T*) gpualloc(n*sizeof(T));	
+	cublasSetVector(n,sizeof(T),&a,0,v,1);
+	}
+#endif
 }

+
 template <typename T>
 inline NRVec<T>::NRVec(const T *a, const int n) : nn(n), count(new int)
 {
-		v=new T[n];
-		*count = 1;
-		memcpy(v, a, n*sizeof(T));
+#ifdef CUDALA
+location=DEFAULT_LOC;
+    if(location==cpu)
+        {
+#endif
+	v=new T[n];
+	*count = 1;
+	memcpy(v, a, n*sizeof(T));
+#ifdef CUDALA
+        }
+    else
+        {
+        v= (T*) gpualloc(n*sizeof(T));
+        cublasSetVector(n,sizeof(T),a,1,v,1);
+        }
+#endif
+
 }

 template <typename T>
@@ -203,12 +291,28 @@ inline NRVec<T>::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int)
 {
 	if(!skeleton)
 		{
+#ifdef CUDALA
+location=DEFAULT_LOC;
+    if(location==cpu)
+        {
+#endif
 		v=new T[n];
 		*count = 1;
 		memcpy(v, a, n*sizeof(T));
+#ifdef CUDALA
+        }
+    else
+        {
+        v= (T*) gpualloc(n*sizeof(T));
+        cublasSetVector(n,sizeof(T),a,1,v,1);
+        }
+#endif
 		}
 	else
 		{
+#ifdef CUDALA
+		if(location!=cpu) laerror("NRVec() with skeleton option cannot be on GPU");
+#endif
 		*count = 2;
 		v=a;
 		}
@@ -217,6 +321,9 @@ inline NRVec<T>::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int)
 template <typename T>
 inline NRVec<T>::NRVec(const NRVec<T> &rhs)
 {
+#ifdef CUDALA
+	location=rhs.location;
+#endif
 	v = rhs.v;
 	nn = rhs.nn;
 	count = rhs.count;
@@ -226,6 +333,9 @@ inline NRVec<T>::NRVec(const NRVec<T> &rhs)
 template <typename T>
 inline NRVec<T>::NRVec(const NRSMat<T> &rhs)
 {
+#ifdef CUDALA
+        location=rhs.location;
+#endif
 	nn = rhs.nn;
 	nn = NN2;
 	v = rhs.v;
@@ -233,28 +343,11 @@ inline NRVec<T>::NRVec(const NRSMat<T> &rhs)
 	(*count)++;
 }

-// x += a
-template<>
-inline NRVec<double> & NRVec<double>::operator+=(const double &a)
-{
-	copyonwrite();
-	cblas_daxpy(nn, 1.0, &a, 0, v, 1);
-	return *this;
-}
-
-template<>
-inline NRVec< complex<double> > &
-NRVec< complex<double> >::operator+=(const complex<double> &a)
-{
-	copyonwrite();
-	cblas_zaxpy(nn, &CONE, &a, 0, v, 1);
-	return *this;
-}
-
-//and for general type
+// x +/-= a
 template <typename T>
 inline NRVec<T> & NRVec<T>::operator+=(const T &a)
 {
+	NOT_GPU(*this);
        copyonwrite();
 	int i;
 	for(i=0; i<nn; ++i) v[i]+=a;
@@ -262,65 +355,26 @@ inline NRVec<T> & NRVec<T>::operator+=(const T &a)
 }


-// x -= a
-template<>
-inline NRVec<double> & NRVec<double>::operator-=(const double &a)
-{
-	copyonwrite();
-	cblas_daxpy(nn, -1.0, &a, 0, v, 1);
-	return *this;
-}
-
-template<>
-inline NRVec< complex<double> > &
-NRVec< complex<double> >::operator-=(const complex<double> &a)
-{
-	copyonwrite();
-	cblas_zaxpy(nn, &CMONE, &a, 0, v, 1);
-	return *this;
-}
-
-//and for general type
 template <typename T>
 inline NRVec<T> & NRVec<T>::operator-=(const T &a)
 {
+	NOT_GPU(*this);
        copyonwrite();
-        int i;
-        for(i=0; i<nn; ++i) v[i]-=a;
+	int i;
+	for(i=0; i<nn; ++i) v[i]-=a;
        return *this;
 }


+
 // x += x
-template<>
-inline NRVec<double> & NRVec<double>::operator+=(const NRVec<double> &rhs)
-{
-#ifdef DEBUG
-	if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
-#endif
-	copyonwrite();
-	cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1);
-	return *this;
-}
-
-template<>
-inline NRVec< complex<double> > &
-NRVec< complex<double> >::operator+=(const NRVec< complex<double> > &rhs)
-{
-#ifdef DEBUG
-	if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
-#endif
-	copyonwrite();
-	cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1);
-	return *this;
-}
-
-//and for general type
 template <typename T>
 inline NRVec<T> & NRVec<T>::operator+=(const NRVec<T> &rhs)
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
+NOT_GPU(*this);
+NOT_GPU(rhs);
 #endif
        copyonwrite();
 	int i;
@@ -346,6 +400,8 @@ inline NRVec<T> & NRVec<T>::operator/=(const NRVec<T> &rhs)
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("/= of incompatible vectors");
+NOT_GPU(*this);
+NOT_GPU(rhs);
 #endif
        copyonwrite();
        int i;
@@ -356,35 +412,13 @@ inline NRVec<T> & NRVec<T>::operator/=(const NRVec<T> &rhs)


 // x -= x
-template<>
-inline NRVec<double> & NRVec<double>::operator-=(const NRVec<double> &rhs)
-{
-#ifdef DEBUG
-	if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
-#endif
-	copyonwrite();
-	cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1);
-	return *this;
-}
-
-template<>
-inline NRVec< complex<double> > &
-NRVec< complex<double> >::operator-=(const NRVec< complex<double> > &rhs)
-{
-#ifdef DEBUG
-	if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
-#endif
-	copyonwrite();
-	cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1);
-	return *this;
-}
-
-//and for general type
 template <typename T>
 inline NRVec<T> & NRVec<T>::operator-=(const NRVec<T> &rhs)
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
+NOT_GPU(*this);
+NOT_GPU(rhs);
 #endif
        copyonwrite();
        int i;
@@ -394,27 +428,10 @@ inline NRVec<T> & NRVec<T>::operator-=(const NRVec<T> &rhs)


 // x *= a
-template<>
-inline NRVec<double> & NRVec<double>::operator*=(const double &a)
-{
-	copyonwrite();
-	cblas_dscal(nn, a, v, 1);
-	return *this;
-}
-
-template<>
-inline NRVec< complex<double> > &
-NRVec< complex<double> >::operator*=(const complex<double> &a)
-{
-	copyonwrite();
-	cblas_zscal(nn, &a, v, 1);
-	return *this;
-}
-
-//and for general type
 template <typename T>
 inline NRVec<T> & NRVec<T>::operator*=(const T &a)
 {
+NOT_GPU(*this);
        copyonwrite();
        int i;
        for(i=0; i<nn; ++i) v[i]*=a;
@@ -423,33 +440,13 @@ inline NRVec<T> & NRVec<T>::operator*=(const T &a)


 // scalar product x.y
-template<>
-inline const double NRVec<double>::operator*(const NRVec<double> &rhs) const
-{
-#ifdef DEBUG
-        if (nn != rhs.nn) laerror("dot of incompatible vectors");
-#endif
-        return cblas_ddot(nn, v, 1, rhs.v, 1);
-}
-
-
-template<>
-inline const complex<double>
-NRVec< complex<double> >::operator*(const NRVec< complex<double> > &rhs) const
-{
-#ifdef DEBUG
-	if (nn != rhs.nn) laerror("dot of incompatible vectors");
-#endif
-	complex<double> dot;
-	cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot);
-	return dot;
-}
-
 template<typename T>
 inline const T NRVec<T>::operator*(const NRVec<T> &rhs) const
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("dot of incompatible vectors");
+NOT_GPU(*this);
+NOT_GPU(rhs);
 #endif
 	T dot = 0;
 	for(int i=0; i<nn; ++i) dot+= v[i]*rhs.v[i];
@@ -458,28 +455,6 @@ inline const T NRVec<T>::operator*(const NRVec<T> &rhs) const



-// Sum of elements
-template<>
-inline const double NRVec<double>::asum() const
-{
-	return cblas_dasum(nn, v, 1);
-}
-
-
-// Dot product: x * y
-template<>
-inline const double NRVec<double>::dot(const double *y, const int stride) const
-{
-	return cblas_ddot(nn, y, stride, v, 1);
-}
-template<>
-inline const complex<double>
-NRVec< complex<double> >::dot(const complex<double> *y, const int stride) const
-{
-	complex<double> dot;
-	cblas_zdotc_sub(nn, y, stride, v, 1, &dot);
-	return dot;
-}

 // x[i] returns i-th element
 template <typename T>
@@ -489,6 +464,7 @@ inline T & NRVec<T>::operator[](const int i)
 	if(_LA_count_check && *count != 1) laerror("possible lval [] with count > 1");
 	if(i < 0 || i >= nn) laerror("NRVec out of range");
 	if(!v) laerror("[] on unallocated NRVec");
+NOT_GPU(*this);
 #endif
 	return v[i];
 }
@@ -498,6 +474,7 @@ inline const T & NRVec<T>::operator[](const int i) const
 #ifdef DEBUG
 	if(i < 0 || i >= nn) laerror("NRVec out of range");
 	if(!v) laerror("[] on unallocated NRVec");
+NOT_GPU(*this);
 #endif
 	return v[i];
 }
@@ -527,29 +504,6 @@ inline NRVec<T>::operator const T*() const
 	return v;
 }

-// return norm of the Vec
-template<>
-inline const double  NRVec<double>::norm() const
-{
-	return cblas_dnrm2(nn, v, 1);
-}
-template<>
-inline const double NRVec< complex<double> >::norm() const
-{
-	return cblas_dznrm2(nn, v, 1);
-}
-
-// Max element of the array
-template<>
-inline const double  NRVec<double>::amax() const
-{
-	return v[cblas_idamax(nn, v, 1)];
-}
-template<>
-inline const complex<double> NRVec< complex<double> >::amax() const
-{
-	return v[cblas_izamax(nn, v, 1)];
-}


 // Make Vec unitvector
@@ -576,7 +530,16 @@ NRVec<T>::~NRVec()
 {
        if(!count) return;
        if(--(*count) <= 0) {
-                if(v) delete[] (v);
+                if(v) 
+			{
+#ifdef CUDALA
+                    if(location==cpu)
+#endif
+			delete[] (v);
+#ifdef CUDALA
+			else gpufree(v);
+#endif
+			}
                delete count;
        }
 }
@@ -591,12 +554,29 @@ void NRVec<T>::copyonwrite()
    (*count)--;
    count = new int;
    *count = 1;
-    T *newv = new T[nn];
-    memcpy(newv, v, nn*sizeof(T));
+    T *newv;
+#ifdef CUDALA
+    if(location==cpu) 
+	{
+#endif
+	newv = new T[nn];
+    	memcpy(newv, v, nn*sizeof(T));
+#ifdef CUDALA
+        }
+     else 
+        {
+         newv = (T *) gpualloc(nn*sizeof(T));
+         if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
+         cublasScopy(nn*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
+        }
+#endif
+
+
    v = newv;
  }
 }

+
 // Asignment
 template <typename T>
 NRVec<T> & NRVec<T>::operator=(const NRVec<T> &rhs)
@@ -606,17 +586,29 @@ NRVec<T> & NRVec<T>::operator=(const NRVec<T> &rhs)
    if(count)
      if(--(*count) == 0)
      {
-        delete[] v;
+#ifdef CUDALA
+        if(location==cpu)
+#endif
+        	delete[] v;
+#ifdef CUDALA
+	else 
+		gpufree(v);
+#endif
        delete count;
      }
    v = rhs.v;
    nn = rhs.nn;
    count = rhs.count;
+#ifdef CUDALA
+    location=rhs.location;
+#endif
    if(count) (*count)++;
  }
  return *this;
 }

+
+
 // Resize
 template <typename T>
 void NRVec<T>::resize(const int n)
@@ -629,7 +621,17 @@ void NRVec<T>::resize(const int n)
    if(n==0)
 	{
 	if(--(*count) <= 0) {
-                if(v) delete[] (v);
+                if(v) 
+			{
+#ifdef CUDALA
+			if(location==cpu)
+#endif
+				delete[] (v);
+#ifdef CUDALA
+			else
+				gpufree(v);
+#endif
+			}
                delete count;
        	}
 	count=0;
@@ -648,14 +650,33 @@ void NRVec<T>::resize(const int n)
    count = new int;
    *count = 1;
    nn = n;
-    v = new T[nn];
+#ifdef CUDALA
+     if(location==cpu)
+#endif
+    	v = new T[nn];
+#ifdef CUDALA
+      else
+        v = (T*) gpualloc(nn*sizeof(T));
+#endif
    return;
  }
  // *count = 1 in this branch
  if (n != nn) {
    nn = n;
-    delete[] v;
-    v = new T[nn];
+#ifdef CUDALA
+     if(location==cpu)
+#endif
+  	{
+    	delete[] v;
+    	v = new T[nn];
+	}
+#ifdef CUDALA
+      else
+	{
+	gpufree(v);
+	v = (T*) gpualloc(nn*sizeof(T));
+	}
+#endif
  }
 }

@@ -664,30 +685,18 @@ void NRVec<T>::resize(const int n)
 template <typename T>
 NRVec<T> & NRVec<T>::operator|=(const NRVec<T> &rhs)
 {
-	if (this != &rhs) {
 #ifdef DEBUG
 		if (!rhs.v) laerror("unallocated rhs in NRVec operator |=");
 #endif
-		if (count)
-			if (*count > 1) {
-				--(*count);
-				nn = 0;
-				count = 0;
-				v = 0;
-			}
-		if (nn != rhs.nn) {
-			if (v) delete[] (v);
-			nn = rhs.nn;
-		}
-		if(!v) v = new T[nn];
-		if(!count) count = new int;
-		*count = 1;
-		memcpy(v, rhs.v, nn*sizeof(T));
-	}
-	return *this;
+        if (this == &rhs) return *this;
+        *this = rhs;
+        this->copyonwrite();
+        return *this;
 }


+
+
 template<typename T>
 NRVec<complex<T> > complexify(const NRVec<T> &rhs)
 {
@@ -696,6 +705,291 @@ for(int i=0; i<rhs.size(); ++i)  r[i]=rhs[i];
 return r;
 }

+
+#ifdef CUDALA
+template<typename T>
+void NRVec<T>::moveto(const GPUID dest)
+{
+if(location==dest) return;
+location=dest;
+
+if(v && !count) laerror("internal inconsistency of reference counting 1");
+if (!count) return;
+
+if(v && *count==0) laerror("internal inconsistency of reference counting 2");
+if(!v) return;
+
+T *vold = v;
+
+if(dest == cpu) //moving from GPU to CPU
+        {
+        v = new T[nn];
+        gpuget(nn,sizeof(T),vold,v);
+        if(*count == 1) gpufree(vold);
+        else {--(*count); count = new int(1);}
+        }
+else    //moving from CPU to GPU
+        {
+        v=(T *) gpualloc(nn*sizeof(T));
+        gpuput(nn,sizeof(T),vold,v);
+        if(*count == 1) delete[] vold;
+        else {--(*count); count = new int(1);}
+        }
+}
+#endif
+
+
+//some template specializations leading to BLAS/CUBLAS calls
+template<>
+inline
+NRVec<double> & NRVec<double>::operator+=(const double &a)
+{
+	copyonwrite();
+#ifdef CUDALA
+	if(location==cpu)
+#endif
+		cblas_daxpy(nn, 1.0, &a, 0, v, 1);
+#ifdef CUDALA
+	else
+		{
+		double *d=gpuputdouble(a);
+		cublasDaxpy(nn, 1.0, d, 0, v, 1);
+		gpufree(d);
+		}
+#endif
+	return *this;
+}
+
+template<>
+inline
+NRVec< complex<double> > &
+NRVec< complex<double> >::operator+=(const complex<double> &a)
+{
+	copyonwrite();
+#ifdef CUDALA
+        if(location==cpu)
+#endif
+		cblas_zaxpy(nn, &CONE, &a, 0, v, 1);
+#ifdef CUDALA
+        else
+                {
+                complex<double> *d=gpuputcomplex(a);
+                cublasZaxpy(nn, CUONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1);
+                gpufree(d);
+                }
+#endif
+	return *this;
+}
+
+template<>
+inline
+NRVec<double> & NRVec<double>::operator-=(const double &a)
+{
+	copyonwrite();
+#ifdef CUDALA
+	if(location==cpu)
+#endif
+		cblas_daxpy(nn, -1.0, &a, 0, v, 1);
+#ifdef CUDALA
+	else
+		{
+		double *d=gpuputdouble(a);
+		cublasDaxpy(nn, -1.0, d, 0, v, 1);
+		gpufree(d);
+		}
+#endif
+	return *this;
+}
+
+template<>
+inline
+NRVec< complex<double> > &
+NRVec< complex<double> >::operator-=(const complex<double> &a)
+{
+	copyonwrite();
+#ifdef CUDALA
+        if(location==cpu)
+#endif
+		cblas_zaxpy(nn, &CMONE, &a, 0, v, 1);
+#ifdef CUDALA
+        else
+                {
+                complex<double> *d=gpuputcomplex(a);
+                cublasZaxpy(nn, CUMONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1);
+                gpufree(d);
+                }
+#endif
+	return *this;
+}
+
+
+template<>
+inline
+NRVec<double> & NRVec<double>::operator+=(const NRVec<double> &rhs)
+{
+#ifdef DEBUG
+        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
+#endif
+        copyonwrite();
+        cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1);
+        return *this;
+}
+template<>
+inline
+NRVec< complex<double> > &
+NRVec< complex<double> >::operator+=(const NRVec< complex<double> > &rhs)
+{
+#ifdef DEBUG
+        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
+#endif
+        copyonwrite();
+        cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1);
+        return *this;
+}
+
+
+template<>
+inline
+NRVec<double> & NRVec<double>::operator-=(const NRVec<double> &rhs)
+{
+#ifdef DEBUG
+        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
+#endif
+SAME_LOC(*this,rhs);
+        copyonwrite();
+#ifdef CUDALA
+	if(location==cpu)
+#endif
+        	cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1);
+#ifdef CUDALA
+	else
+		cublasDaxpy(nn, -1.0, rhs.v, 1, v, 1);
+#endif
+        return *this;
+}
+
+template<>
+inline
+NRVec< complex<double> > &
+NRVec< complex<double> >::operator-=(const NRVec< complex<double> > &rhs)
+{
+#ifdef DEBUG
+        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
+#endif
+        copyonwrite();
+        cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1);
+        return *this;
+}
+
+template<>
+inline
+NRVec<double> & NRVec<double>::operator*=(const double &a)
+{
+        copyonwrite();
+        cblas_dscal(nn, a, v, 1);
+        return *this;
+}
+
+template<>
+inline
+NRVec< complex<double> > &
+NRVec< complex<double> >::operator*=(const complex<double> &a)
+{
+        copyonwrite();
+        cblas_zscal(nn, &a, v, 1);
+        return *this;
+}
+
+
+template<>
+inline
+const double NRVec<double>::operator*(const NRVec<double> &rhs) const
+{
+#ifdef DEBUG
+        if (nn != rhs.nn) laerror("dot of incompatible vectors");
+#endif
+        return cblas_ddot(nn, v, 1, rhs.v, 1);
+}
+
+
+template<>
+inline
+const complex<double>
+NRVec< complex<double> >::operator*(const NRVec< complex<double> > &rhs) const
+{
+#ifdef DEBUG
+        if (nn != rhs.nn) laerror("dot of incompatible vectors");
+#endif
+        complex<double> dot;
+        cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot);
+        return dot;
+}
+
+// Sum of elements
+template<>
+inline
+const double NRVec<double>::asum() const
+{
+        return cblas_dasum(nn, v, 1);
+}
+
+
+// Dot product: x * y
+template<>
+inline
+const double NRVec<double>::dot(const double *y, const int stride) const
+{
+        return cblas_ddot(nn, y, stride, v, 1);
+}
+
+template<>
+inline
+const complex<double>
+NRVec< complex<double> >::dot(const complex<double> *y, const int stride) const
+{
+        complex<double> dot;
+        cblas_zdotc_sub(nn, y, stride, v, 1, &dot);
+        return dot;
+}
+
+// return norm of the Vec
+template<>
+inline
+const double  NRVec<double>::norm() const
+{
+#ifdef CUDALA
+	if(location!=cpu) return cublasDnrm2(nn, v, 1);
+#endif
+        return cblas_dnrm2(nn, v, 1);
+}
+
+template<>
+inline
+const double NRVec< complex<double> >::norm() const
+{
+        return cblas_dznrm2(nn, v, 1);
+}
+
+// Max element of the array
+template<>
+inline
+const double  NRVec<double>::amax() const
+{
+        return v[cblas_idamax(nn, v, 1)];
+}
+
+/*
+cblas_izamax seems to be missing at least in some cblas versions
+template<>
+inline
+const complex<double> NRVec< complex<double> >::amax() const
+{
+        return v[cblas_izamax(nn, v, 1)];
+}
+*/
+
+
+
 }//namespace

 #endif /* _LA_VEC_H_ */