*** empty log message ***

2010-06-25 15:28:19 +00:00
parent eb0aaf9adf
commit 074c943862
13 changed files with 1938 additions and 464 deletions
@@ -1,2 +1,70 @@
 #include "la_traits.h"
 #include "cuda_la.h"
 #ifdef CUDALA
 namespace LA {
 GPUID DEFAULT_LOC = cpu;
 void set_default_loc(const GPUID loc) 
 {
 DEFAULT_LOC = loc;
 }
 void *gpualloc(size_t size)
 {
 cublasStatus status;
 void *ptr=NULL;
 status = cublasAlloc(size,1,&ptr);
 if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
 return ptr;
 }
 void gpufree(void *ptr)
 {
 cublasStatus status = cublasFree(ptr);
 if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasFree");
 }
 void gpuget(size_t n, size_t elsize, const void *from, void *to)
 {
 cublasStatus status;
 status=cublasGetVector(n,elsize,from,1,to,1);
 if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasGetVector");
 }
 void gpuput(size_t n, size_t elsize, const void *from, void *to)
 {
 cublasStatus status;
 status=cublasSetVector(n,elsize,from,1,to,1);
 if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
 }
 double *gpuputdouble(const double &x)
 {
 cublasStatus status;
 void *ptr=NULL;
 status = cublasAlloc(1,sizeof(double),&ptr);
 if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
 status=cublasSetVector(1,sizeof(double),&x,1,ptr,1);
 if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
 return (double *)ptr;
 }
 complex<double> *gpuputcomplex(const complex<double> &x)
 {
 cublasStatus status;
 void *ptr=NULL;
 status = cublasAlloc(1,sizeof(complex<double>),&ptr);
 if(status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasAlloc");
 status=cublasSetVector(1,sizeof(complex<double>),&x,1,ptr,1);
 if (status != CUBLAS_STATUS_SUCCESS) laerror("Error in cublasSetVector");
 return (complex<double> *)ptr;
 }
 }
 #endif
@@ -0,0 +1,56 @@
 #ifndef _CUDA_LA_H
 #define _CUDA_LA_H
 #ifdef CUDALA
 #undef MATPTR
 #include "cublas.h"
 #endif
 #include "la_traits.h"
 namespace LA {
 #ifdef CUDALA
 #define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");}
 #define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
 #define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
 #else
 #define NOT_GPU(x) {}
 #define SAME_LOC(x,y) {}
 #define SAME_LOC3(x,y,z) {}
 #endif
 typedef enum {undefined=-1, cpu=0, gpu1=1, gpu2=2, gpu3=3, gpu4=4} GPUID;
 #ifdef CUDALA
 //global static instantiation of this class will provide automatic init/shutdown of GPU
 class GPU_START {
 public:
 	GPU_START(void)
 		{
 		cublasStatus status = cublasInit();
 		if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot init GPU for CUBLAS");
 		}
 	~GPU_START(void)
 		{
 		cublasStatus status = cublasShutdown();
 		if (status != CUBLAS_STATUS_SUCCESS) laerror("Cannot cleanly shutdown GPU");
                }
 };
 extern void *gpualloc(size_t size);
 extern void gpufree(void *ptr);
 extern void gpuget(size_t n, size_t elsize, const void *from, void *to);
 extern void gpuput(size_t n, size_t elsize, const void *from, void *to);
 extern double *gpuputdouble(const double &x);
 extern complex<double> *gpuputcomplex(const complex<double> &x);
 void set_default_loc(const GPUID loc);
 extern GPUID DEFAULT_LOC;
 #endif
 }
 #endif
@@ -40,6 +40,8 @@
 #include "laerror.h"
 #include "cuda_la.h"
 #ifdef NONCBLAS
 #include "noncblas.h"
 #else
@@ -205,6 +207,8 @@ struct LA_traits_aux<complex<C>, scalar_true> {
 typedef complex<C> elementtype;
 typedef complex<C> producttype;
 typedef C normtype;
 typedef C realtype;
 typedef complex<C> complextype;
 static inline C sqrabs(const complex<C> x) { return x.real()*x.real()+x.imag()*x.imag();}
 static inline bool gencmp(const complex<C> *x, const complex<C> *y, int n) {return memcmp(x,y,n*sizeof(complex<C>));}
 static bool bigger(const  complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
@@ -230,6 +234,8 @@ struct LA_traits_aux<C, scalar_true> {
 typedef C elementtype;
 typedef C producttype;
 typedef C normtype;
 typedef C realtype;
 typedef complex<C> complextype;
 static inline C sqrabs(const C x) { return x*x;}
 static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));}
 static inline bool bigger(const  C &x, const C &y) {return x>y;}
@@ -261,6 +267,8 @@ struct LA_traits_aux<X<C>, scalar_false> { \
 typedef C elementtype; \
 typedef X<C> producttype; \
 typedef typename LA_traits<C>::normtype normtype; \
 typedef X<typename LA_traits<C>::realtype> realtype; \
 typedef X<typename LA_traits<C>::complextype> complextype; \
 static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
 static inline bool bigger(const  C &x, const C &y) {return x>y;} \
 static inline bool smaller(const  C &x, const C &y) {return x<y;} \
@@ -293,6 +301,8 @@ struct LA_traits_aux<X<C>, scalar_false> {  \
 typedef C elementtype;  \
 typedef NRMat<C> producttype;  \
 typedef typename LA_traits<C>::normtype normtype;  \
 typedef X<typename LA_traits<C>::realtype> realtype; \
 typedef X<typename LA_traits<C>::complextype> complextype; \
 static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
 static inline bool bigger(const  C &x, const C &y) {return x>y;} \
 static inline bool smaller(const  C &x, const C &y) {return x<y;} \
@@ -25,6 +25,7 @@
 #include <errno.h>
 #include <stdarg.h>
 #include "cuda_la.h"
 #ifdef USE_TRACEBACK
 #include "traceback.h"
@@ -32,6 +33,11 @@
 namespace LA {
 //enforce GPU initialization by a global class instantization constructor
 #ifdef CUDALA
 GPU_START gpu_start_instant;
 #endif
 bool _LA_count_check=true;
 extern "C" void _findme(void) {}; //for autoconf test we need a function with C linkage
@@ -45,6 +51,13 @@ void laerror(const char *s1)
    std::cerr << s1  << "\n";
    std::cout << s1  << "\n";
  }
 #ifdef CUDALA
 {
 cublasStatus s=cublasGetError();
 std::cerr << "CUBLAS status = "<<s<<std::endl;
 std::cout << "CUBLAS status = "<<s<<std::endl;
 }
 #endif
  if(errno) perror("system error");
 throw LAerror(s1); 
@@ -122,6 +122,15 @@ return r;
 template <typename T>
 void NRMat<T>::put(int fd, bool dim, bool transp) const
 {
 #ifdef CUDALA
 if(location!=cpu)
 	{
 	NRMat<T> tmp= *this;
 	tmp.moveto(cpu);
 	tmp.put(fd,dim,transp);
 	return;
 	}
 #endif
 errno=0;
 if(dim)
 {
@@ -153,6 +162,17 @@ else LA_traits<T>::multiput(nn*mm,fd,
 template <typename T>
 void NRMat<T>::get(int fd, bool dim, bool transp)
 {
 #ifdef CUDALA
 if(location!=cpu)
        {
        NRMat<T> tmp;
 	tmp.moveto(cpu);
        tmp.get(fd,dim,transp);
        tmp.moveto(location);
 	*this = tmp;
        return;
        }
 #endif
 int nn0,mm0;
 errno=0;
 if(dim)
@@ -188,6 +208,43 @@ else LA_traits<T>::multiget(nn*mm,fd,
 // Assign diagonal
 template <>
 NRMat<double> & NRMat<double>::operator=(const double &a)
 {
        copyonwrite();
 #ifdef DEBUG
        if (nn != mm) laerror("RMat.operator=scalar on non-square matrix");
 #endif
 #ifdef CUDALA
        if(location==cpu)
 	{
 #endif
 #ifdef MATPTR
         memset(v[0],0,nn*nn*sizeof(double));
         for (int i=0; i< nn; i++) v[i][i] = a;
 #else
 	 double n=0.;
 	 cblas_dcopy(nn*nn, &n, 0, v, 1);
 	 cblas_dcopy(nn, &a, 0, v, nn+1);
 #endif
 #ifdef CUDALA
 	}
 	else
 	{
 	double *d=gpuputdouble(0.);
        cublasDcopy(nn*nn, d, 0, v, 1);
 	gpufree(d);
 	d=gpuputdouble(a);
        cublasDcopy(nn, d, 0, v, nn+1);
 	gpufree(d);
 	}
 #endif
         return *this;
 }
 template <typename T>
 NRMat<T> & NRMat<T>::operator=(const T &a)
 {
@@ -206,6 +263,64 @@ NRMat<T> & NRMat<T>::operator=(const T &a)
 }
 template <>
 NRMat<double> & NRMat<double>::operator+=(const double&a)
 {
        copyonwrite();
 #ifdef DEBUG
        if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix");
 #endif
 #ifdef CUDALA
 	if(location==cpu)
 	{
 #endif
 #ifdef MATPTR
        for (int i=0; i< nn; i++) v[i][i] += a;
 #else
 	cblas_daxpy(nn, 1.0, &a, 0, *this, nn+1);
 #endif
 #ifdef CUDALA
 	}
 	else
 	{
 	double *d=gpuputdouble(a);
 	cublasDaxpy(nn, 1.0, d, 0, *this, nn+1);
 	gpufree(d);
 	}
 #endif
        return *this;
 }
 template <>
 NRMat<double> & NRMat<double>::operator-=(const double&a)
 {
        copyonwrite();
 #ifdef DEBUG
        if (nn != mm) laerror("Mat.operator+=scalar on non-square matrix");
 #endif
 #ifdef CUDALA
 	if(location==cpu)
 	{
 #endif
 #ifdef MATPTR
        for (int i=0; i< nn; i++) v[i][i] -= a;
 #else
 	cblas_daxpy(nn, -1.0, &a, 0, *this, nn+1);
 #endif
 #ifdef CUDALA
 	}
 	else
 	{
        double *d=gpuputdouble(a);
        cublasDaxpy(nn, -1.0, d, 0, *this, nn+1);
        gpufree(d);
 	}
 #endif
        return *this;
 }
 // M += a
@@ -240,6 +355,31 @@ NRMat<T> & NRMat<T>::operator-=(const T &a)
 	return *this;
 }
 template <>
 const NRMat<double> NRMat<double>::operator-() const
 {
        NRMat<double> result(nn, mm);
 #ifdef CUDALA
        if(location==cpu)
 	{
 #endif
 #ifdef MATPTR
        for (int i=0; i<nn*mm; i++) result.v[0][i]= -v[0][i];
 #else
 	cblas_dscal(nn*mm, -1., v, 1);
 #endif
 #ifdef CUDALA
        }
 	else
 	{
 	cublasDscal(nn*mm, -1., v, 1);
 	}
 #endif
        return result;
 }
 // unary minus
 template <typename T>
 const NRMat<T> NRMat<T>::operator-() const
@@ -253,6 +393,7 @@ const NRMat<T> NRMat<T>::operator-() const
 	return result;
 }
 // direct sum
 template <typename T>
 const NRMat<T> NRMat<T>::operator&(const NRMat<T> & b) const
@@ -540,7 +681,13 @@ template<>
 NRMat<double> & NRMat<double>::operator*=(const double &a)
 {
 	copyonwrite();
-	cblas_dscal(nn*mm, a, *this, 1);
+#ifdef CUDALA
 	if(location==cpu)
 #endif
 		cblas_dscal(nn*mm, a, *this, 1);
 #ifdef CUDALA
 	else 	cublasDscal(nn*mm, a, v, 1);
 #endif
 	return *this;
 }
@@ -559,6 +706,7 @@ NRMat< complex<double> >::operator*=(const complex<double> &a)
 template <typename T>
 NRMat<T> & NRMat<T>::operator*=(const T &a)
 {
 NOT_GPU(*this);
        copyonwrite();
 #ifdef MATPTR
         for (int i=0; i< nn*mm; i++) v[0][i] *= a;
@@ -578,8 +726,16 @@ NRMat<double> & NRMat<double>::operator+=(const NRMat<double>  &rhs)
 	if (nn != rhs.nn || mm!= rhs.mm) 
 		laerror("Mat += Mat of incompatible matrices");
 #endif
 SAME_LOC(*this,rhs);
 	copyonwrite();
 #ifdef CUDALA
 	if(location==cpu)
 #endif
 	cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1);
 #ifdef CUDALA
 	else 
 	cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1);
 #endif
 	return *this;
 }
@@ -625,8 +781,16 @@ NRMat<double> & NRMat<double>::operator-=(const NRMat<double>  &rhs)
 	if (nn != rhs.nn || mm!= rhs.mm) 
 		laerror("Mat -= Mat of incompatible matrices");
 #endif
 	SAME_LOC(*this,rhs);
 	copyonwrite();
 #ifdef CUDALA
 	if(location==cpu)
 #endif
 	cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1);
 #ifdef CUDALA
 	else
 	cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1);
 #endif
 	return *this;
 }
@@ -836,9 +1000,18 @@ const NRMat<double> NRMat<double>::operator*(const NRMat<double> &rhs) const
 	if (mm != rhs.nn) laerror("product of incompatible matrices");
 	if (rhs.mm <=0) laerror("illegal matrix dimension in gemm");
 #endif
-	NRMat<double> result(nn, rhs.mm);
+SAME_LOC(*this,rhs);
 	NRMat<double> result(nn, rhs.mm,rhs.getlocation());
 #ifdef CUDALA
        if(location==cpu)
 #endif
 	cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nn, rhs.mm, mm, 1.0,
 			*this, mm, rhs, rhs.mm, 0.0, result, rhs.mm);
 #ifdef CUDALA
        else
 	cublasDgemm('N','N',rhs.mm,nn,mm,1.0,rhs, rhs.mm,*this, mm, 0.0, result, rhs.mm);
 #endif
 	return result;
 }
@@ -991,12 +1164,21 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
 	if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in Mat:gemm()");
 	if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm");
 #endif
 SAME_LOC3(*this,a,b);
 	if (alpha==0.0 && beta==1.0) return;
 	copyonwrite();
 #ifdef CUDALA
 	if(location==cpu)
 #endif
 	cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans),
 			(transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
 			a.mm, b , b.mm, beta, *this , mm);
 #ifdef CUDALA
        else
 	cublasDgemm(transb,transa,mm,nn,k,alpha, b , b.mm, a,a.mm, beta, *this , mm);
 #endif
 }
@@ -1028,7 +1210,20 @@ void NRMat< complex<double> >::gemm(const complex<double> & beta,
 template<>
 const double  NRMat<double>::norm(const double scalar) const
 {
-	if (!scalar) return cblas_dnrm2(nn*mm, (*this)[0], 1);
+	if (!scalar)  
 		{
 #ifdef CUDALA
 		if(location==cpu)
 #endif
 		return cblas_dnrm2(nn*mm, (*this)[0], 1);
 #ifdef CUDALA
 		else
 		return cublasDnrm2(nn*mm, v, 1);
 #endif
 		}
 NOT_GPU(*this);
 	double sum = 0;
 	for (int i=0; i<nn; i++)
 		for (int j=0; j<mm; j++) {
@@ -1246,6 +1441,127 @@ else //vectors are columns
 //------------------------------------------------------------------------------
 //	for a matrix A(1:nn,1:mm) performs Fortran-like 
 //	operation A(nn:-1:1,:) 
 //------------------------------------------------------------------------------
 template<>
 NRMat<double>& NRMat<double>::SwapRows(){
        copyonwrite();
        const int n_pul = this->nn / 2;
        double * const dataIn  = this->v;
        for(register int i=0; i<n_pul; i++){
                cblas_dswap(mm, dataIn + i*mm, 1, dataIn + (nn-i-1)*mm, 1);
        }
        return *this;
 }
 //------------------------------------------------------------------------------
 template<>
 NRMat<complex<double> >& NRMat<complex<double> >::SwapRows(){
        copyonwrite();
 	const int n = this->nn;
 	const int m = this->mm;
        const int n_pul = this->nn / 2;
        complex<double> * const dataIn  = this->v;
        for(register int i=0; i<n_pul; i++){
                cblas_zswap(m, dataIn + i*m, 1, dataIn + (n-i-1)*m, 1);
        }
        return *this;
 }
 //------------------------------------------------------------------------------
 template<typename T>
 NRMat<T>& NRMat<T>::SwapRows(){
        copyonwrite();
 	const int n = this->nn;
 	const int m = this->mm;
        const int n_pul = this->nn / 2;
        T * const dataIn  = this->v;
        for(register int i=0; i<n_pul; i++){
 		const int offset1 = i*m;
 		const int offset2 = (n-i-1)*m;
 		for(register int j=0;j<m;j++){
 			dataIn[offset1 + j] = dataIn[offset2 + j];
 		}
        }
        return *this;
 }
 //------------------------------------------------------------------------------
 //	for a matrix A(1:nn,1:mm) performs Fortran-like 
 //	operation A(:,mm:-1:1) 
 //------------------------------------------------------------------------------
 template<>
 NRMat<double>& NRMat<double>::SwapCols(){
        copyonwrite();
 	const int n = this->nn;
 	const int m = this->mm;
        const int m_pul = m / 2;
        double * const dataIn  = this->v;
        for(register int i=0; i<m_pul; i++){
                cblas_dswap(n, dataIn + i, m, dataIn + (m-i-1), m);
        }
        return *this;
 }
 //------------------------------------------------------------------------------
 template<>
 NRMat<complex<double> >& NRMat<complex<double> >::SwapCols(){
        copyonwrite();
        const int n_pul = this->nn / 2;
        const int m_pul = this->mm / 2;
        complex<double> * const dataIn  = this->v;
        for(register int i=0; i<m_pul; i++){
                cblas_zswap(nn, dataIn + i, mm, dataIn + (mm-i-1), mm);
        }
        return *this;
 }
 //------------------------------------------------------------------------------
 template<typename T>
 NRMat<T>& NRMat<T>::SwapCols(){
        copyonwrite();
        const int n_pul = nn / 2;
        const int m_pul = mm / 2;
        T * const dataIn  = this->v;
        for(register int i=0; i<m_pul; i++){
 		for(register int j=0;j<nn;j++){
 			const int jm = j*mm;
 			dataIn[i + jm] = dataIn[(mm-i-1) + jm];
 		}
        }
        return *this;
 }
 //------------------------------------------------------------------------------
 //	for a matrix A(1:nn,1:mm) performs Fortran-like 
 //	operation A(nn:-1:1,mm:-1:1)
 //------------------------------------------------------------------------------
 template<typename T>
 NRMat<T>& NRMat<T>::SwapRowsCols(){
        this->copyonwrite();
        const int n = this->nn;
        const int m = this->mm;
        T * const dataIn  = this->v;
        T * const dataOut = this->v;
 	const int Dim = n*m;
        for(register int i=0;i<n;i++){
                const int off = i*n;
                for(register int j=0;j<m;j++){
                        const int offset = off + j;
                        dataOut[Dim - (offset + 1)] = dataIn[offset];
                }
        }
        return *this;
 }
 //------------------------------------------------------------------------------
 //////////////////////////////////////////////////////////////////////////////
@@ -33,12 +33,20 @@ protected:
 	T *v;
 #endif
 	int *count;
 #ifdef CUDALA
        GPUID location;
 #endif
 public:
 	friend class NRVec<T>;
 	friend class NRSMat<T>;
-	inline NRMat() : nn(0), mm(0), v(0), count(0) {};
+	inline NRMat() : nn(0), mm(0), v(0), count(0) 
-	inline NRMat(const int n, const int m);
+			{
 #ifdef CUDALA
 			location = DEFAULT_LOC;
 #endif
 			};
 	inline NRMat(const int n, const int m ,const GPUID loc= undefined);
 	inline NRMat(const T &a, const int n, const int m);
 	NRMat(const T *a, const int n, const int m);
 	inline NRMat(const NRMat &rhs);
@@ -57,6 +65,13 @@ public:
 #endif
 	const bool operator==(const NRMat &rhs) const {return !(*this != rhs);};
 	inline int getcount() const {return count?*count:0;}
 #ifdef CUDALA
 	inline GPUID getlocation() const {return location;}
 	void moveto(const GPUID dest);
 #else
 	inline GPUID getlocation() const {return cpu;}
 	void moveto(const GPUID dest) {};
 #endif
 	NRMat & operator=(const NRMat &rhs);  //assignment
 	void randomize(const typename  LA_traits<T>::normtype &x); //fill with random numbers
 	NRMat & operator=(const T &a);    //assign a to diagonal
@@ -88,7 +103,7 @@ public:
 	const NRMat operator*(const NRSMat<T> &rhs) const; // Mat * Smat
 	const NRMat operator&(const NRMat &rhs) const; // direct sum
 	const NRMat operator|(const NRMat<T> &rhs) const; // direct product
-	const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
+	const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
 	const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
 	const NRVec<T> rsum() const; //sum of rows
 	const NRVec<T> csum() const; //sum of columns
@@ -157,9 +172,14 @@ public:
 namespace LA {
 // ctors
 template <typename T>
-NRMat<T>::NRMat(const int n, const int m) : nn(n), mm(m), count(new int)
+NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int)
 {
 	*count = 1;
 #ifdef CUDALA
        location= (loc==undefined?DEFAULT_LOC:loc);
        if(location==cpu)
 	{
 #endif
 #ifdef MATPTR
 	v = new T*[n];
 	v[0] = new T[m*n];
@@ -167,14 +187,29 @@ NRMat<T>::NRMat(const int n, const int m) : nn(n), mm(m), count(new int)
 #else
 	v = new T[m*n];
 #endif
 #ifdef CUDALA
 	}
        else
 	{
 	v= (T*) gpualloc(n*m*sizeof(T));
 	}
 #endif
 }
 template <typename T>
 NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int)
 {
 #ifdef CUDALA
        location=DEFAULT_LOC;
 #endif
 	int i;
 	T *p;
 	*count = 1;
 #ifdef CUDALA
        if(location==cpu)
 	{
 #endif
 #ifdef MATPTR
 	v = new T*[n];
 	p = v[0] = new T[m*n];
@@ -186,12 +221,29 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
 		for (i=0; i< n*m; i++) *p++ = a;
 	else
 		memset(p, 0, n*m*sizeof(T));
 #ifdef CUDALA
 	}
 	else
 	{
 	v= (T*) gpualloc(n*m*sizeof(T));
 	cublasSetVector(n*m,sizeof(T),&a,0,v,1);
 	}
 #endif
 }
 template <typename T>
 NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int)
 {
 #ifdef CUDALA
        location=DEFAULT_LOC;
 #endif
 	*count = 1;
 #ifdef CUDALA
        if(location==cpu)
        {
 #endif
 #ifdef MATPTR
 	v = new T*[n];
 	v[0] = new T[m*n];
@@ -201,11 +253,25 @@ NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new
 	v = new T[m*n];
 	memcpy(v, a, n*m*sizeof(T));
 #endif
 #ifdef CUDALA
        }
        else
        {
        v= (T*) gpualloc(n*m*sizeof(T));
        cublasSetVector(n*m,sizeof(T),a,1,v,1);
        }
 #endif
 }
 //copy constructor
 template <typename T>
 NRMat<T>::NRMat(const NRMat &rhs)
 {
 #ifdef CUDALA
        location=rhs.location;
 #endif
 	nn = rhs.nn;
 	mm = rhs.mm;
 	count = rhs.count;
@@ -213,9 +279,16 @@ NRMat<T>::NRMat(const NRMat &rhs)
 	if (count) ++(*count);
 }
 template <typename T>
 NRMat<T>::NRMat(const NRSMat<T> &rhs)
 {
 NOT_GPU(rhs); 
 #ifdef CUDALA
 	location=rhs.location;
 #endif
 	int i;
 	nn = mm = rhs.nrows();
 	count = new int;
@@ -244,6 +317,10 @@ NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset)
 {
 	if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
 #ifdef CUDALA
 location=rhs.location;
 #endif
 	nn = n;
 	mm = m;
 	count = rhs.count;
@@ -303,6 +380,7 @@ inline T & NRMat<T>::operator()(const int i, const int j)
 	if (_LA_count_check && *count != 1) laerror("Mat lval use of (,) with count > 1");
 	if (i<0 || i>=nn &&nn>0 || j<0 || j>=mm && mm>0) laerror("Mat (,) out of range");
 	if (!v) laerror("(,) for unallocated Mat");
 NOT_GPU(*this);
 #endif
 #ifdef MATPTR
 	return v[i][j];
@@ -310,12 +388,14 @@ inline T & NRMat<T>::operator()(const int i, const int j)
 	return v[i*mm+j];
 #endif
 }
 template <typename T>
 inline const T & NRMat<T>::operator()(const int i, const int j) const
 {
 #ifdef DEBUG
 	if (i<0 || i>=nn&&nn>0 || j<0 || j>=mm&& mm>0) laerror("Mat (,) out of range");
 	if (!v) laerror("(,) for unallocated Mat");
 NOT_GPU(*this); //in principle we could copy the element to CPU memory, yielding, however, a highly inneficient contruct 
 #endif
 #ifdef MATPTR
 	return v[i][j];
@@ -391,7 +471,7 @@ inline const complex<double>  NRMat< complex<double> >::amax() const
 }
-//basi stuff to be available for any type ... must be in .h
+//basic stuff to be available for any type ... must be in .h
 // dtor
 template <typename T>
 NRMat<T>::~NRMat()
@@ -399,10 +479,21 @@ NRMat<T>::~NRMat()
        if (!count) return;
        if (--(*count) <= 0) {
                if (v) {
 #ifdef CUDALA
 		    if(location==cpu)
 #endif
 		    	{
 #ifdef MATPTR
                        delete[] (v[0]);
 #endif
                        delete[] v;
 		    	}
 #ifdef CUDALA
 		    else 
 			{
 			gpufree(v);
 			}
 #endif
                }
                delete count;
        }
@@ -415,14 +506,27 @@ NRMat<T> & NRMat<T>::operator=(const NRMat<T> &rhs)
        if (this !=&rhs)
 		{
        	if (count) 
-                    if (--(*count) ==0 ) {
+                    if (--(*count) ==0 ) 
 			{
 #ifdef CUDALA
 			if(location==cpu) 
 			{
 #endif
 #ifdef MATPTR
                        delete[] (v[0]);
 #endif
                        delete[] v;
 #ifdef CUDALA
 			}
 			else gpufree(v);
 #endif
                        delete count;
                	}
                v = rhs.v;
 #ifdef CUDALA
                location=rhs.location;
 #endif
                nn = rhs.nn;
                mm = rhs.mm;
                count = rhs.count;
@@ -437,46 +541,8 @@ template <typename T>
 NRMat<T> & NRMat<T>::operator|=(const NRMat<T> &rhs)
 {
        if (this == &rhs) return *this;
-#ifdef DEBUG
+	*this = rhs;
-        if (!rhs.v) laerror("unallocated rhs in Mat operator |=");
+	this->copyonwrite();
 #endif
        if (count)
                if (*count > 1) {
                        --(*count);
                        nn = 0;
                        mm = 0;
                        count = 0;
                        v = 0;
                }
        if (nn != rhs.nn || mm != rhs.mm) {
                if (v) {
 #ifdef MATPTR
                        delete[] (v[0]);
 #endif
                        delete[] (v);
                        v = 0;
                }
                nn = rhs.nn;
                mm = rhs.mm;
        }
        if (!v) {
 #ifdef MATPTR
                v = new T*[nn];
                v[0] = new T[mm*nn];
 #else
                v = new T[mm*nn];
 #endif
        }
 #ifdef MATPTR
        for (int i=1; i< nn; i++) v[i] = v[i-1] + mm;
        memcpy(v[0], rhs.v[0], nn*mm*sizeof(T));
 #else
        memcpy(v, rhs.v, nn*mm*sizeof(T));
 #endif
        if (!count) count = new int;
        *count = 1;
        return *this;
 }
@@ -486,9 +552,13 @@ void NRMat<T>::copyonwrite()
 {
        if (!count) laerror("Mat::copyonwrite of undefined matrix");
        if (*count > 1) {
-                (*count)--;
+             (*count)--;
-                count = new int;
+             count = new int;
-                *count = 1;
+             *count = 1;
 #ifdef CUDALA
 	     if(location==cpu) //matrix is in CPU memory
 	     {
 #endif
 #ifdef MATPTR
                T **newv = new T*[nn];
                newv[0] = new T[mm*nn];
@@ -499,10 +569,21 @@ void NRMat<T>::copyonwrite()
                T *newv = new T[mm*nn];
                memcpy(newv, v, mm*nn*sizeof(T));
                v = newv;
 #endif
 #ifdef CUDALA
 	     }
          else //matrix is in GPU memory
 		{
 		T *newv = (T *) gpualloc(mm*nn*sizeof(T));
 		if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
                cublasScopy(nn*mm*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
                v = newv;
 		}
 #endif
        }
 }
 template <typename T>
 void NRMat<T>::resize(int n, int m)
 {
@@ -519,10 +600,18 @@ if(m==0) n=0;
 	        if(n==0 && m==0)
        		{
 		        if(--(*count) <= 0) {
 #ifdef CUDALA
                           if(location==cpu)
 				{
 #endif
 #ifdef MATPTR
                		if(v) delete[] (v[0]);
 #endif
                		if(v) delete[] v;
 #ifdef CUDALA
 				}
 			    else gpufree(v);
 #endif
 		                delete count;
 		                }
 		        count=0;
@@ -543,6 +632,10 @@ if(m==0) n=0;
                *count = 1;
                nn = n;
                mm = m;
 #ifdef CUDALA
                if(location==cpu) 
 		{
 #endif
 #ifdef MATPTR
                v = new T*[nn];
                v[0] = new T[m*n];
@@ -550,12 +643,22 @@ if(m==0) n=0;
 #else
                v = new T[m*n];
 #endif
 #ifdef CUDALA
                }
 		else
 		v = (T *) gpualloc(n*m*sizeof(T));
 #endif
                return;
        }
        // At this point *count = 1, check if resize is necessary
        if (n!=nn || m!=mm) {
                nn = n;
                mm = m;
 #ifdef CUDALA
             if(location==cpu)
 	     {
 #endif
 #ifdef MATPTR
                delete[] (v[0]);
 #endif
@@ -566,6 +669,14 @@ if(m==0) n=0;
                for (int i=1; i< n; i++) v[i] = v[i-1] + m;
 #else
                v = new T[m*n];
 #endif
 #ifdef CUDALA
             }
            else
 	     {
             gpufree(v);
             v=(T *) gpualloc(n*m*sizeof(T));
 	     }
 #endif
        }
 }
@@ -587,7 +698,11 @@ return r;
 // I/O
 template <typename T>
 std::ostream& operator<<(std::ostream &s, const NRMat<T> &x)
-                {
+{
 #ifdef CUDALA
        if(x.getlocation()==cpu)
 		{
 #endif
                int i,j,n,m;
                n=x.nrows();
                m=x.ncols();
@@ -597,18 +712,43 @@ std::ostream& operator<<(std::ostream &s, const NRMat<T> &x)
                        for(j=0; j<m;j++) s << (typename LA_traits_io<T>::IOtype) x[i][j] << (j==m-1 ? '\n' : ' '); // endl cannot be used in the conditional expression, since it is an overloaded function
                        }
                return s;
-                }
+#ifdef CUDALA
 		}
 	else
 		{
 		NRMat<T> tmp=x;
 		tmp.moveto(cpu);
 		return s<<tmp;
 		}
 #endif
 }
 template <typename T>
 std::istream& operator>>(std::istream  &s, NRMat<T> &x)
 {
 #ifdef CUDALA
        if(x.getlocation()==cpu)
                {
 #endif
                int i,j,n,m;
                s >> n >> m;
                x.resize(n,m);
 		typename LA_traits_io<T>::IOtype tmp;
                for(i=0;i<n;i++) for(j=0; j<m;j++) { s>>tmp; x[i][j]=tmp;}
                return s;
-                }
+#ifdef CUDALA
 		}
 	else
 		{
 		NRMat<T> tmp;
 		tmp.moveto(cpu);
 		s >> tmp;
 		tmp.moveto(x.getlocation());
 		x=tmp;
 		return s;
 		}
 #endif
 }
 //optional indexing from 1
@@ -671,6 +811,38 @@ NRMat<T> & NRMat<T>::operator^=(const NRMat<T>  &rhs){
 }
 #ifdef CUDALA
 template<typename T>
 void NRMat<T>::moveto(const GPUID dest)
 {
 if(location==dest) return;
 location=dest;
 if(v && !count) laerror("internal inconsistency of reference counting 1");
 if (!count) return;
 if(v && *count==0) laerror("internal inconsistency of reference counting 2");
 if(!v) return;
 T *vold = v;
 if(dest == cpu) //moving from GPU to CPU
 	{
 	v = new T[nn*mm];
 	gpuget(nn*mm,sizeof(T),vold,v);
 	if(*count == 1) gpufree(vold);
 	else {--(*count); count = new int(1);}
 	}
 else	//moving from CPU to GPU
 	{
 	v=(T *) gpualloc(nn*mm*sizeof(T));
 	gpuput(nn*mm,sizeof(T),vold,v);
 	if(*count == 1) delete[] vold;
 	else {--(*count); count = new int(1);}
 	}
 }
 #endif
 //end CUDALA
@@ -20,106 +20,186 @@
 #include "noncblas.h"
 #include "laerror.h"
 #include "mat.h"
 #include "fortran.h"
 #ifdef FORTRAN_
 #define FORNAME(x) x##_
 #else
 #define FORNAME(x) x
 #endif
 #ifdef NONCBLAS
 //Level 1 - straightforward wrappers
-extern "C" double FORNAME(ddot) (const int *n, const double *x, const int *incx, const double *y, const int *incy);
+extern "C" double FORNAME(ddot) (const FINT *n, const double *x, const FINT *incx, const double *y, const FINT *incy);
 double cblas_ddot(const int N, const double *X, const int incX,
                  const double *Y, const int incY)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 return FORNAME(ddot)(&ntmp,X,&incxtmp,Y,&incytmp);
 #else
 return FORNAME(ddot)(&N,X,&incX,Y,&incY);
 #endif
 }
-extern "C" void FORNAME(dscal) (const int *n, const double *a, double *x, const int *incx);
+extern "C" void FORNAME(dscal) (const FINT *n, const double *a, double *x, const FINT *incx);
 void cblas_dscal(const int N, const double alpha, double *X, const int incX)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 FORNAME(dscal) (&ntmp,&alpha,X,&incxtmp);
 #else
 FORNAME(dscal) (&N,&alpha,X,&incX);
 #endif
 }
-extern "C" void FORNAME(dcopy) (const int *n, const double *x, const int *incx, double *y, const int *incy);
+extern "C" void FORNAME(dcopy) (const FINT *n, const double *x, const FINT *incx, double *y, const FINT *incy);
 void cblas_dcopy(const int N, const double *X, const int incX,
                 double *Y, const int incY)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 FORNAME(dcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
 #else
 FORNAME(dcopy) (&N,X,&incX,Y,&incY);
 #endif
 }
-extern "C" void FORNAME(daxpy) (const int *n, const double *a, const double *x, const int *incx, double *y, const int *incy);
+extern "C" void FORNAME(daxpy) (const FINT *n, const double *a, const double *x, const FINT *incx, double *y, const FINT *incy);
 void cblas_daxpy(const int N, const double alpha, const double *X,
                 const int incX, double *Y, const int incY)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 FORNAME(daxpy) (&ntmp,&alpha,X,&incxtmp,Y,&incytmp);
 #else
 FORNAME(daxpy) (&N,&alpha,X,&incX,Y,&incY);
 #endif
 }
-extern "C" double FORNAME(dnrm2) (const int *n, const double *x, const int *incx);
+extern "C" double FORNAME(dnrm2) (const FINT *n, const double *x, const FINT *incx);
 double cblas_dnrm2(const int N, const double *X, const int incX)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 return FORNAME(dnrm2) (&ntmp,X,&incxtmp);
 #else
 return FORNAME(dnrm2) (&N,X,&incX);
 #endif
 }
-extern "C" double FORNAME(dasum) (const int *n, const double *x, const int *incx);
+extern "C" double FORNAME(dasum) (const FINT *n, const double *x, const FINT *incx);
 double cblas_dasum(const int N, const double *X, const int incX)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 return FORNAME(dasum) (&ntmp,X,&incxtmp);
 #else
 return FORNAME(dasum) (&N,X,&incX);
 #endif
 }
-extern "C" void FORNAME(zcopy) (const int *n, const void *x, const int *incx, void *y, const int *incy);
+extern "C" void FORNAME(zcopy) (const FINT *n, const void *x, const FINT *incx, void *y, const FINT *incy);
 void cblas_zcopy(const int N, const void *X, const int incX,
                 void *Y, const int incY)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 FORNAME(zcopy) (&ntmp,X,&incxtmp,Y,&incytmp);
 #else
 FORNAME(zcopy) (&N,X,&incX,Y,&incY);
 #endif
 }
-extern "C" void FORNAME(zaxpy) (const int *n, const void *a, const void *x, const int *incx, void *y, const int *incy);
+extern "C" void FORNAME(zaxpy) (const FINT *n, const void *a, const void *x, const FINT *incx, void *y, const FINT *incy);
 void cblas_zaxpy(const int N, const void *alpha, const void *X,
                 const int incX, void *Y, const int incY)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 FORNAME(zaxpy) (&ntmp,alpha,X,&incxtmp,Y,&incytmp);
 #else
 FORNAME(zaxpy) (&N,alpha,X,&incX,Y,&incY);
 #endif
 }
-extern "C" void FORNAME(zscal) (const int *n, const void *a, void *x, const int *incx);
+extern "C" void FORNAME(zscal) (const FINT *n, const void *a, void *x, const FINT *incx);
 void cblas_zscal(const int N, const void *alpha, void *X, const int incX)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 FORNAME(zscal)(&ntmp,alpha,X,&incxtmp);
 #else
 FORNAME(zscal)(&N,alpha,X,&incX);
 #endif
 }
-extern "C" void FORNAME(zdscal) (const int *n, const double *a, void *x, const int *incx);
+extern "C" void FORNAME(zdscal) (const FINT *n, const double *a, void *x, const FINT *incx);
 void cblas_zdscal(const int N, const double alpha, void *X, const int incX)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 FORNAME(zdscal)(&ntmp,&alpha,X,&incxtmp);
 #else
 FORNAME(zdscal)(&N,&alpha,X,&incX);
 #endif
 }
-extern "C" double FORNAME(dznrm2) (const int *n, const void *x, const int *incx);
+extern "C" double FORNAME(dznrm2) (const FINT *n, const void *x, const FINT *incx);
 double cblas_dznrm2(const int N, const void *X, const int incX)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 return FORNAME(dznrm2) (&ntmp,X,&incxtmp);
 #else
 return FORNAME(dznrm2) (&N,X,&incX);
 #endif
 }
 //the following ones are f2c-compatible, but is it truly portable???
-extern "C" void FORNAME(zdotu) (void *retval, const int *n, const void *x, const int *incx, const void *y, const int *incy);
+extern "C" void FORNAME(zdotu) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
 void   cblas_zdotu_sub(const int N, const void *X, const int incX,
                       const void *Y, const int incY, void *dotu)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 FORNAME(zdotu) (dotu,&ntmp,X,&incxtmp,Y,&incytmp);
 #else
 FORNAME(zdotu) (dotu,&N,X,&incX,Y,&incY);
 #endif
 }
-extern "C" void FORNAME(zdotc) (void *retval, const int *n, const void *x, const int *incx, const void *y, const int *incy);
+extern "C" void FORNAME(zdotc) (void *retval, const FINT *n, const void *x, const FINT *incx, const void *y, const FINT *incy);
 void   cblas_zdotc_sub(const int N, const void *X, const int incX,
                       const void *Y, const int incY, void *dotc)
 {
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 FORNAME(zdotc) (dotc,&ntmp,X,&incxtmp,Y,&incytmp);
 #else
 FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
 #endif
 }
@@ -129,7 +209,7 @@ FORNAME(zdotc) (dotc,&N,X,&incX,Y,&incY);
 //enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, AtlasConj=114};
-extern "C" void FORNAME(dspmv) (const char *uplo, const int *n, const double *alpha, const double *ap, const double *x, const int *incx, const double *beta, double *y, const int *incy);
+extern "C" void FORNAME(dspmv) (const char *uplo, const FINT *n, const double *alpha, const double *ap, const double *x, const FINT *incx, const double *beta, double *y, const FINT *incy);
 void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
                 const int N, const double alpha, const double *Ap,
                 const double *X, const int incX,
@@ -137,11 +217,18 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
 {
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
 #else
 FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
 #endif
 }
-extern "C" void FORNAME(zhpmv) (const char *uplo, const int *n, const void *alpha, const void *ap, const void *x, const int *incx, const void *beta, void *y, const int *incy);
+extern "C" void FORNAME(zhpmv) (const char *uplo, const FINT *n, const void *alpha, const void *ap, const void *x, const FINT *incx, const void *beta, void *y, const FINT *incy);
 void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
                 const int N, const void *alpha, const void *Ap,
                 const void *X, const int incX,
@@ -149,20 +236,36 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
 {
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 if(Uplo!=CblasLower) LA::laerror("CblasLower uplo asserted");
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
 #else
 FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
 #endif
 }
 //Level 2 and Level 3 on general matrices - take into account the transposed storage of matrices in Fortran and C
-extern "C" void FORNAME(dger) (const int *m, const int *n, const double *alpha, const double *x, const int *incx, const double *y, const int *incy, double *a, const int *lda);
+extern "C" void FORNAME(dger) (const FINT *m, const FINT *n, const double *alpha, const double *x, const FINT *incx, const double *y, const FINT *incy, double *a, const FINT *lda);
 void cblas_dger(const enum CBLAS_ORDER Order, const int M, const int N,
                const double alpha, const double *X, const int incX,
                const double *Y, const int incY, double *A, const int lda)
 {
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 //swap m-n, y-x
 #ifdef FORINT
 const FINT mtmp=M;
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 const FINT ldatmp=lda;
 FORNAME(dger) (&ntmp, &mtmp, &alpha, Y, &incytmp, X, &incxtmp, A, &ldatmp);
 #else
 FORNAME(dger) (&N, &M, &alpha, Y, &incY, X, &incX, A, &lda);
 #endif
 }
 void cblas_zgerc(const enum CBLAS_ORDER Order, const int M, const int N,
@@ -179,7 +282,7 @@ void cblas_zgeru(const enum CBLAS_ORDER Order, const int M, const int N,
 LA::laerror("cblas_zgeru cannot be simply converted to fortran order");
 }
-extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const int *m, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc);
+extern "C" void FORNAME(dgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const double *alpha, const double *a, const FINT *lda, const double *b, const FINT *ldb, const double *beta, double *c, const FINT *ldc);
 void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
                 const int K, const double alpha, const double *A,
@@ -188,11 +291,22 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
 {
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 //swap a-b, m-n
 #ifdef FORINT
 const FINT mtmp=M;
 const FINT ntmp=N;
 const FINT ktmp=K;
 const FINT ldatmp=lda;
 const FINT ldbtmp=ldb;
 const FINT ldctmp=ldc;
 FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
 	 &ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
 #else
 FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
 	 &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
 #endif
 }
-extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const int *m, const int *n, const int *k, const void *alpha, const void *a, const int *lda, const void *b, const int *ldb, const void *beta, void *c, const int *ldc);
+extern "C" void FORNAME(zgemm) (const char *transa, const char *transb, const FINT *m, const FINT *n, const FINT *k, const void *alpha, const void *a, const FINT *lda, const void *b, const FINT *ldb, const void *beta, void *c, const FINT *ldc);
 void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA,
                 const enum CBLAS_TRANSPOSE TransB, const int M, const int N,
                 const int K, const void *alpha, const void *A,
@@ -201,26 +315,49 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
 {
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 //swap a-b, m-n
 #ifdef FORINT
 const FINT mtmp=M;
 const FINT ntmp=N;
 const FINT ktmp=K;
 const FINT ldatmp=lda;
 const FINT ldbtmp=ldb;
 const FINT ldctmp=ldc;
 FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
 		 TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
         &ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
 #else
 FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
 		 TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
         &N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
 #endif
 }
-extern "C" void FORNAME(dgemv)  (const char *TRANS, const int *M, const int *N, const double *ALPHA, const double *A,  const int *LDA,  const double *X,  const int *INCX, const double *BETA, double *Y, const int *INCY);
+extern "C" void FORNAME(dgemv)  (const char *TRANS, const FINT *M, const FINT *N, const double *ALPHA, const double *A,  const FINT *LDA,  const double *X,  const FINT *INCX, const double *BETA, double *Y, const FINT *INCY);
 void cblas_dgemv(const enum CBLAS_ORDER Order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const double alpha, const double *A, const int lda,
                 const double *X, const int incX, const double beta,
                 double *Y, const int incY)
 {
 #ifdef FORINT
 const FINT mtmp=M;
 const FINT ntmp=N;
 const FINT ldatmp=lda;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
 //swap n-m and toggle transposition
 else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
 #else
 if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
 //swap n-m and toggle transposition
 else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
 #endif
 }
-extern "C" void FORNAME(zgemv)  (const char *TRANS, const int *M, const int *N, const void *ALPHA, const void *A,  const int *LDA,  const void *X,  const int *INCX, const void *BETA, void *Y, const int *INCY);
+extern "C" void FORNAME(zgemv)  (const char *TRANS, const FINT *M, const FINT *N, const void *ALPHA, const void *A,  const FINT *LDA,  const void *X,  const FINT *INCX, const void *BETA, void *Y, const FINT *INCY);
 void cblas_zgemv(const enum CBLAS_ORDER Order,
                 const enum CBLAS_TRANSPOSE TransA, const int M, const int N,
                 const void *alpha, const void *A, const int lda,
@@ -230,14 +367,29 @@ void cblas_zgemv(const enum CBLAS_ORDER Order,
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 if(TransA == CblasConjTrans) LA::laerror("zgemv with CblasConjTrans not supportted");
 //swap n-m and toggle transposition
 #ifdef FORINT
 const FINT mtmp=M;
 const FINT ntmp=N;
 const FINT ldatmp=lda;
 const FINT incxtmp=incX;
 const FINT incytmp=incY;
 FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
 #else
 FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
 #endif
 }
-extern "C" int FORNAME(idamax) (const int *N, const double *DX, const int *INCX);
+extern "C" FINT FORNAME(idamax) (const FINT *N, const double *DX, const FINT *INCX);
 CBLAS_INDEX cblas_idamax(const int N, const double *X, const int incX)
 {
-return FORNAME(idamax)(&N,X,&incX);
+#ifdef FORINT
 const FINT ntmp=N;
 const FINT incxtmp=incX;
 return (CBLAS_INDEX)FORNAME(idamax)(&ntmp,X,&incxtmp);
 #else
 return (CBLAS_INDEX)FORNAME(idamax)(&N,X,&incX);
 #endif
 }
@@ -246,21 +398,38 @@ return FORNAME(idamax)(&N,X,&incX);
 #ifdef  NONCLAPACK
 //clapack_dgesv
 //allocate auxiliary storage and transpose input and output quantities to fortran/C order
-extern "C" void FORNAME(dgesv) (const int *N, const int *NRHS, double *A, const int *LDA, int *IPIV, double *B, const int *LDB, int *INFO);
+extern "C" void FORNAME(dgesv) (const FINT *N, const FINT *NRHS, double *A, const FINT *LDA, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO);
 int clapack_dgesv(const enum CBLAS_ORDER Order, const int N, const int NRHS,
                  double *A, const int lda, int *ipiv,
                  double *B, const int ldb)
 {
-int INFO=0;
+FINT INFO=0;
 if(Order!=CblasRowMajor) LA::laerror("CblasRowMajor order asserted");
 //B should be in the same physical order, just transpose A in place and the LU result on output
 for(int i=1; i<N; ++i) for(int j=0; j<i; ++j)  {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
 #ifdef FORINT
 const FINT ntmp=N;
 const FINT nrhstmp=NRHS;
 const FINT ldatmp=lda;
 const FINT ldbtmp=ldb;
 FINT ipivtmp=*ipiv;
 FORNAME(dgesv) (&ntmp,&nrhstmp,A,&ldatmp,&ipivtmp,B,&ldbtmp,&INFO);
 #else
 FORNAME(dgesv) (&N,&NRHS,A,&lda,ipiv,B,&ldb,&INFO);
 #endif
 for(int i=1; i<N; ++i) for(int j=0; j<i; ++j)  {double t=A[j*lda+i]; A[j*lda+i]=A[i*lda+j]; A[i*lda+j]=t;}
-return INFO;
+return (int)INFO;
 }
 #endif
 void cblas_dswap(const int N, double *X, const int incX,
                 double *Y, const int incY){
  LA::laerror("cblas_dswap is not available... (-DNONCBLAS)");
 }
 void cblas_zswap(const int N, void *X, const int incX,
                 void *Y, const int incY){
  LA::laerror("cblas_zswap is not available... (-DNONCBLAS)");
 }		 
@@ -23,14 +23,11 @@
 #include "mat.h"
 #include "nonclass.h"
 #include "qsort.h"
 #include "fortran.h"
 namespace LA {
 #ifdef FORTRAN_
 #define FORNAME(x) x##_
 #else
 #define FORNAME(x) x
 #endif
 #define INSTANTIZE(T) \
 template void lawritemat(FILE *file,const T *a,int r,int c,const char *form0, \
@@ -191,16 +188,23 @@ linear_solve_do(A,&B[0],1,A.nrows(),det,n);
 // Next routines are not available in clapack, fotran ones will be used with an
 // additional swap/transpose of outputs when needed
-extern "C" void FORNAME(dspsv)(const char *UPLO, const int *N, const int *NRHS,
+extern "C" void FORNAME(dspsv)(const char *UPLO, const FINT *N, const FINT *NRHS,
-		double *AP, int *IPIV, double *B, const int *LDB, int *INFO);
+		double *AP, FINT *IPIV, double *B, const FINT *LDB, FINT *INFO);
 static void linear_solve_do(NRSMat<double> &a, double *b, const int nrhs, const int ldb, double *det, int n)
 {
-	int r, *ipiv;
+	FINT r, *ipiv;
 	a.copyonwrite();
-	ipiv = new int[n];
+	ipiv = new FINT[n];
 	char U = 'U';
 #ifdef FORINT
        const FINT ntmp=n;
        const FINT nrhstmp=nrhs;
 	const FINT ldbtmp=ldb;
 	FORNAME(dspsv)(&U, &ntmp, &nrhstmp, a, ipiv, b, &ldbtmp,&r);
 #else	
 	FORNAME(dspsv)(&U, &n, &nrhs, a, ipiv, b, &ldb,&r);
 #endif	
 	if (r < 0) {
 		delete[] ipiv;
 		laerror("illegal argument in spsv() call of linear_solve()");
@@ -239,8 +243,8 @@ linear_solve_do(a,&B[0],1,a.nrows(),det,n);
 //other version of linear solver based on gesvx
 //------------------------------------------------------------------------------
-extern "C" void FORNAME(zgesvx)(const char *fact, const char *trans, const int *n, const int *nrhs, complex<double> *A,  const int *lda, complex<double> *AF, const int *ldaf, const int *ipiv, char *equed, double *R,double *C, complex<double> *B, const int *ldb, complex<double> *X, const int *ldx, double *rcond, double *ferr, double *berr, complex<double> *work, double *rwork, int *info);
+extern "C" void FORNAME(zgesvx)(const char *fact, const char *trans, const FINT *n, const FINT *nrhs, complex<double> *A,  const FINT *lda, complex<double> *AF, const FINT *ldaf, const FINT *ipiv, char *equed, double *R,double *C, complex<double> *B, const FINT *ldb, complex<double> *X, const FINT *ldx, double *rcond, double *ferr, double *berr, complex<double> *work, double *rwork, FINT *info);
-extern "C" void FORNAME(dgesvx)(const char *fact, const char *trans, const int *n, const int *nrhs, double *A,  const int *lda, double *AF, const int *ldaf, const int *ipiv, char *equed, double *R,double *C, double *B, const int *ldb, double *X, const int *ldx, double *rcond, double *ferr, double *berr, double *work, int *iwork, int *info);
+extern "C" void FORNAME(dgesvx)(const char *fact, const char *trans, const FINT *n, const FINT *nrhs, double *A,  const FINT *lda, double *AF, const FINT *ldaf, const FINT *ipiv, char *equed, double *R,double *C, double *B, const FINT *ldb, double *X, const FINT *ldx, double *rcond, double *ferr, double *berr, double *work, FINT *iwork, FINT *info);
 //------------------------------------------------------------------------------
 // solves set of linear equations using dgesvx
 // input:
@@ -270,18 +274,18 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
 	double *A;
 	double * const _A_data = (double*)_A;
-	int info;
+	FINT info;
-	const int nrhs	= _rhsCount;
+	const FINT nrhs	= _rhsCount;
-	const int n	= _eqCount;
+	const FINT n	= _eqCount;
-	int lda		= A_cols;
+	FINT lda	= A_cols;
-	const int ldaf	= lda;
+	const FINT ldaf	= lda;
 	double rcond;
 	double ferr[nrhs], berr[nrhs], work[4*n];
 	double R[n], C[n];
-	int *const iwork = new int[n];
+	FINT *const iwork = new FINT[n];
-	int *const ipiv  = new int[n];
+	FINT *const ipiv  = new FINT[n];
 	double *X  = new double[n*nrhs];
 	double *AF = new double[ldaf*n];
@@ -312,7 +316,7 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
 	if(_saveA){
 		delete[] A;
 	}
-	return info;
+	return (int)info;
 }
 //------------------------------------------------------------------------------
 // solves set of linear equations using zgesvx
@@ -343,18 +347,18 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
 	complex<double> *A;
 	complex<double> * const _A_data = (complex<double>*)_A;
-	int info;
+	FINT info;
-	const int nrhs	= _rhsCount;
+	const FINT nrhs	= _rhsCount;
-	const int n	= _eqCount;
+	const FINT n	= _eqCount;
-	int lda		= A_cols;
+	FINT lda	= A_cols;
-	const int ldaf	= lda;
+	const FINT ldaf	= lda;
 	double rcond;
 	double ferr[nrhs], berr[nrhs];
 	double R[n], C[n], rwork[2*n];
 	complex<double> work[2*n];
-	int *const ipiv = new int[n];
+	FINT *const ipiv = new FINT[n];
 	complex<double> *X  = new complex<double>[n*nrhs];
 	complex<double> *AF = new complex<double>[ldaf*n];
@@ -386,7 +390,7 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
 	if(_saveA){
 		delete[] A;
 	}
-	return info;
+	return (int)info;
 }
 //------------------------------------------------------------------------------
 // for given square matrices A, B computes X = AB^{-1} as follows
@@ -402,8 +406,8 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
 //------------------------------------------------------------------------------
 int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, double *_rcond){
-	const int n = _A.nrows();
+	const FINT n = _A.nrows();
-	const int m = _A.ncols();
+	const FINT m = _A.ncols();
 	if(n != m || n != _B.nrows() || n != _B.ncols()){
 		laerror("multiply_by_inverse: incompatible matrices");
 	}
@@ -417,13 +421,13 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
 	double * const B = (double*)_B;
 	_B.copyonwrite();//even if fact='N', call copyonwrite because the solution is going to be stored in _B
-	int info;
+	FINT info;
 	double rcond;
 	double ferr[n], berr[n], work[4*n];
 	double R[n], C[n];
-	int *const iwork = new int[n];
+	FINT *const iwork = new FINT[n];
-	int *const ipiv  = new int[n];
+	FINT *const ipiv  = new FINT[n];
 	double *X  = new double[n2];
 	double *AF = new double[n2];
@@ -437,7 +441,7 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
 	delete[] iwork;delete[] ipiv;
 	delete[] AF;delete[] X;
-	return info;
+	return (int)info;
 }
 //------------------------------------------------------------------------------
 // for given square matrices A, B computes X = AB^{-1} as follows
@@ -453,8 +457,8 @@ int multiply_by_inverse(NRMat<double> &_A, NRMat<double> &_B, bool _useEq, doubl
 //------------------------------------------------------------------------------
 int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B, bool _useEq, double *_rcond){
-	const int n = _A.nrows();
+	const FINT n = _A.nrows();
-	const int m = _A.ncols();
+	const FINT m = _A.ncols();
 	if(n != m || n != _B.nrows() || n != _B.ncols()){
 		laerror("multiply_by_inverse: incompatible matrices");
 	}
@@ -468,13 +472,13 @@ int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B
 	complex<double> * const B = (complex<double>*)_B;
 	_B.copyonwrite();//even if fact='N', call copyonwrite because the solution is going to be stored in _B
-	int info;
+	FINT info;
 	double rcond;
 	double ferr[n], berr[n];
 	double R[n], C[n], rwork[2*n];
 	complex<double> work[2*n];
-	int *const ipiv = new int[n];
+	FINT *const ipiv = new FINT[n];
 	complex<double> *X  = new complex<double>[n2];
 	complex<double> *AF = new complex<double>[n2];
@@ -488,24 +492,24 @@ int multiply_by_inverse(NRMat<complex<double> > &_A, NRMat<complex<double> > &_B
 	delete[] ipiv;
 	delete[] AF;delete[] X;
-	return info;
+	return (int)info;
 }
 //------------------------------------------------------------------------------
-extern "C" void FORNAME(dsyev)(const char *JOBZ, const char *UPLO, const int *N,
+extern "C" void FORNAME(dsyev)(const char *JOBZ, const char *UPLO, const FINT *N,
-		double *A, const int *LDA, double *W, double *WORK, const int *LWORK, int *INFO);
+		double *A, const FINT *LDA, double *W, double *WORK, const FINT *LWORK, FINT *INFO);
-extern "C" void FORNAME(dsygv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
+extern "C" void FORNAME(dsygv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
-                double *A, const int *LDA, double *B, const int *LDB, double *W, double *WORK, const int *LWORK, int *INFO);
+                double *A, const FINT *LDA, double *B, const FINT *LDB, double *W, double *WORK, const FINT *LWORK, FINT *INFO);
 // a will contain eigenvectors (columns if corder==1), w eigenvalues
 void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec, 
 		const bool corder, int n, NRMat<double> *b, const int itype)
 {
-	int m = a.nrows();
+	FINT m = a.nrows();
 	if (m != a.ncols()) laerror("diagonalize() call with non-square matrix");
 	if (a.nrows() != w.size()) 
 		laerror("inconsistent dimension of eigenvalue vector in diagonalize()");
@@ -517,22 +521,38 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
 	w.copyonwrite();
 	if(b) b->copyonwrite();
-	int r = 0;
+	FINT r = 0;
 	char U ='U';
 	char vectors = 'V';
 	if (!eivec) vectors = 'N';
-	int LWORK = -1;
+	FINT LWORK = -1;
 	double WORKX;
-	int ldb=0; if(b) ldb=b->ncols();
+	FINT ldb=0; if(b) ldb=b->ncols();
 #ifdef FORINT
        const FINT itypetmp = itype;
 	FINT ntmp = n;
 	// First call is to determine size of workspace
 	if(b) FORNAME(dsygv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
 	else FORNAME(dsyev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, &r );
 #else
 	// First call is to determine size of workspace
 	if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
 	else FORNAME(dsyev)(&vectors, &U, &n, a, &m, w, &WORKX, &LWORK, &r );
-	LWORK = (int)WORKX;
+#endif
        LWORK = (FINT)WORKX;
 	double *WORK = new double[LWORK];
-	if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, &r );
+
 #ifdef FORINT
 	if(b) FORNAME(dsygv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, &r );
        else FORNAME(dsyev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, &r );
 #else
        if(b) FORNAME(dsygv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, &r );
 	else FORNAME(dsyev)(&vectors, &U, &n, a, &m, w, WORK, &LWORK, &r );
-	delete[] WORK;
+#endif
        delete[] WORK;
 	if (vectors == 'V' && corder) a.transposeme(n);
 	if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()");
@@ -541,18 +561,18 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
-extern "C" void FORNAME(zheev)(const char *JOBZ, const char *UPLO, const int *N,
+extern "C" void FORNAME(zheev)(const char *JOBZ, const char *UPLO, const FINT *N,
-		complex<double> *A, const int *LDA, double *W, complex<double> *WORK, const int *LWORK, double *RWORK, int *INFO);
+		complex<double> *A, const FINT *LDA, double *W, complex<double> *WORK, const FINT *LWORK, double *RWORK, FINT *INFO);
-extern "C" void FORNAME(zhegv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
+extern "C" void FORNAME(zhegv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
-                complex<double> *A, const int *LDA, complex<double> *B, const int *LDB, double *W, complex<double> *WORK, const int *LWORK, double *RWORK, int *INFO);
+                complex<double> *A, const FINT *LDA, complex<double> *B, const FINT *LDB, double *W, complex<double> *WORK, const FINT *LWORK, double *RWORK, FINT *INFO);
 // a will contain eigenvectors (columns if corder==1), w eigenvalues
 void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec, 
 		const bool corder, int n, NRMat<complex<double> > *b, const int itype)
 {
-	int m = a.nrows();
+	FINT m = a.nrows();
 	if (m != a.ncols()) laerror("diagonalize() call with non-square matrix");
 	if (a.nrows() != w.size()) 
 		laerror("inconsistent dimension of eigenvalue vector in diagonalize()");
@@ -564,23 +584,38 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
 	w.copyonwrite();
 	if(b) b->copyonwrite();
-	int r = 0;
+	FINT r = 0;
 	char U ='U';
 	char vectors = 'V';
 	if (!eivec) vectors = 'N';
-	int LWORK = -1;
+	FINT LWORK = -1;
 	complex<double> WORKX;
-	int ldb=0; if(b) ldb=b->ncols();
+	FINT ldb=0; if(b) ldb=b->ncols();
 	// First call is to determine size of workspace
 	double *RWORK = new double[3*n+2]; 
-	if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
+#ifdef FORINT
        const FINT itypetmp = itype;
 	FINT ntmp = n;
        if(b) FORNAME(zhegv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
 	else FORNAME(zheev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, RWORK, &r );
 #else
        if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
 	else FORNAME(zheev)(&vectors, &U, &n, a, &m, w, &WORKX, &LWORK, RWORK, &r );
-	LWORK = (int)WORKX.real();
+#endif	
        LWORK = (FINT)WORKX.real();
 	complex<double> *WORK = new complex<double>[LWORK];
-	if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, RWORK, &r );
+
 #ifdef FORINT
        if(b) FORNAME(zhegv)(&itypetmp,&vectors, &U, &ntmp, a, &m, *b, &ldb, w, &WORKX, &LWORK, RWORK, &r );
        else FORNAME(zheev)(&vectors, &U, &ntmp, a, &m, w, &WORKX, &LWORK, RWORK, &r );
 #else	
        if(b) FORNAME(zhegv)(&itype,&vectors, &U, &n, a, &m, *b,&ldb, w, WORK, &LWORK, RWORK, &r );
 	else FORNAME(zheev)(&vectors, &U, &n, a, &m, w, WORK, &LWORK, RWORK, &r );
-	delete[] WORK;
+#endif	
        delete[] WORK;
 	delete[] RWORK;
 	if (vectors == 'V' && corder) a.transposeme(n);
@@ -590,11 +625,11 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
-extern "C" void FORNAME(dspev)(const char *JOBZ, const char *UPLO, const int *N,
+extern "C" void FORNAME(dspev)(const char *JOBZ, const char *UPLO, const FINT *N,
-		double *AP, double *W, double *Z, const int *LDZ, double *WORK, int *INFO);
+		double *AP, double *W, double *Z, const FINT *LDZ, double *WORK, FINT *INFO);
-extern "C" void FORNAME(dspgv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
+extern "C" void FORNAME(dspgv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
-                double *AP, double *BP, double *W, double *Z, const int *LDZ, double *WORK, int *INFO);
+                double *AP, double *BP, double *W, double *Z, const FINT *LDZ, double *WORK, FINT *INFO);
 // v will contain eigenvectors, w eigenvalues
@@ -613,14 +648,21 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
 	if(v) v->copyonwrite();
 	if(b) b->copyonwrite();
-	int r = 0;
+	FINT r = 0;
 	char U = 'U';
 	char job = v ? 'v' : 'n';
 	double *WORK = new double[3*n];
-	int ldv=v?v->ncols():n;
+	FINT ldv=v?v->ncols():n;
-	if(b) FORNAME(dspgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK,  &r );
+#ifdef FORINT
        const FINT itypetmp = itype;
 	FINT ntmp = n;
        if(b) FORNAME(dspgv)(&itypetmp,&job, &U, &ntmp, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK,  &r );
 	else FORNAME(dspev)(&job, &U, &ntmp, a, w, v?(*v)[0]:(double *)0, &ldv, WORK,  &r );
 #else
        if(b) FORNAME(dspgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(double *)0, &ldv, WORK,  &r );
 	else FORNAME(dspev)(&job, &U, &n, a, w, v?(*v)[0]:(double *)0, &ldv, WORK,  &r );
 #endif
 	delete[] WORK;
 	if (v && corder) v->transposeme(n);
@@ -629,11 +671,11 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
 }
-extern "C" void FORNAME(zhpev)(const char *JOBZ, const char *UPLO, const int *N,
+extern "C" void FORNAME(zhpev)(const char *JOBZ, const char *UPLO, const FINT *N,
-		complex<double> *AP, double *W, complex<double> *Z, const int *LDZ, complex<double> *WORK, double *RWORK, int *INFO);
+		complex<double> *AP, double *W, complex<double> *Z, const FINT *LDZ, complex<double> *WORK, double *RWORK, FINT *INFO);
-extern "C" void FORNAME(zhpgv)(const int *ITYPE, const char *JOBZ, const char *UPLO, const int *N,
+extern "C" void FORNAME(zhpgv)(const FINT *ITYPE, const char *JOBZ, const char *UPLO, const FINT *N,
-                complex<double> *AP, complex<double> *BP, double *W, complex<double> *Z, const int *LDZ, complex<double> *WORK,  double *RWORK, int *INFO);
+                complex<double> *AP, complex<double> *BP, double *W, complex<double> *Z, const FINT *LDZ, complex<double> *WORK,  double *RWORK, FINT *INFO);
 // v will contain eigenvectors, w eigenvalues
@@ -652,15 +694,22 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
 	if(v) v->copyonwrite();
 	if(b) b->copyonwrite();
-	int r = 0;
+	FINT r = 0;
 	char U = 'U';
 	char job = v ? 'v' : 'n';
 	complex<double> *WORK = new complex<double>[2*n];
 	double *RWORK  = new double[3*n];
-	int ldv=v?v->ncols():n;
+	FINT ldv=v?v->ncols():n;
 #ifdef FORINT	
        const FINT itypetmp = itype;
        FINT ntmp = n;
 	if(b) FORNAME(zhpgv)(&itypetmp,&job, &U, &ntmp, a, *b, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK,  &r );
 	else FORNAME(zhpev)(&job, &U, &ntmp, a, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK,  &r );
 #else
 	if(b) FORNAME(zhpgv)(&itype,&job, &U, &n, a, *b, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK,  &r );
 	else FORNAME(zhpev)(&job, &U, &n, a, w, v?(*v)[0]:(complex<double> *)0, &ldv, WORK, RWORK,  &r );
 #endif
 	delete[] WORK;
 	delete[] RWORK;
 	if (v && corder) v->transposeme(n);
@@ -671,17 +720,17 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
-extern "C" void FORNAME(dgesvd)(const char *JOBU,  const char *JOBVT,  const int *M,
+extern "C" void FORNAME(dgesvd)(const char *JOBU,  const char *JOBVT,  const FINT *M,
-		const int *N,  double *A, const int *LDA, double *S, double *U, const int *LDU,
+		const FINT *N,  double *A, const FINT *LDA, double *S, double *U, const FINT *LDU,
-		double *VT, const int *LDVT, double *WORK, const int *LWORK, int *INFO );
+		double *VT, const FINT *LDVT, double *WORK, const FINT *LWORK, FINT *INFO );
 void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s,
 		NRMat<double> *v, const bool corder, int m, int n)
 {
-	int m0 = a.nrows();
+	FINT m0 = a.nrows();
-	int n0 = a.ncols();
+	FINT n0 = a.ncols();
-	if(m<=0) m=m0;
+	if(m<=0) m=(int)m0;
-	if(n<=0) n=n0;
+	if(n<=0) n=(int)n0;
 	if(n>n0 || m>m0) laerror("bad dimension in singular_decomposition");
 	if (u) if (m > u->nrows() || m> u->ncols())
 		laerror("inconsistent dimension of U Mat in singular_decomposition()");
@@ -700,14 +749,30 @@ void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s
 	char jobu = u ? 'A' : 'N';
 	char jobv = v ? 'A' : 'N';
 	double work0;
-	int lwork = -1;
+	FINT lwork = -1;
-	int r;
+	FINT r;
 #ifdef FORINT
        FINT ntmp = n;
 	FINT mtmp = m;
 	FORNAME(dgesvd)(&jobv, &jobu, &ntmp, &mtmp, a, &n0, s, v?(*v)[0]:0, &n0,
 			u?(*u)[0]:0, &m0, &work0, &lwork, &r);
 #else
 	FORNAME(dgesvd)(&jobv, &jobu, &n, &m, a, &n0, s, v?(*v)[0]:0, &n0,
 			u?(*u)[0]:0, &m0, &work0, &lwork, &r);
-	lwork = (int) work0;
+#endif
 	lwork = (FINT) work0;
 	double *work = new double[lwork];
 #ifdef FORINT
 	FORNAME(dgesvd)(&jobv, &jobu, &ntmp, &mtmp, a, &n0, s, v?(*v)[0]:0, &n0,
 			u?(*u)[0]:0, &m0, work, &lwork, &r);
 #else
 	FORNAME(dgesvd)(&jobv, &jobu, &n, &m, a, &n0, s, v?(*v)[0]:0, &n0,
 			u?(*u)[0]:0, &m0, work, &lwork, &r);
 #endif
 	delete[] work;
 	if (v && corder) v->transposeme(n);
@@ -719,14 +784,14 @@ void singular_decomposition(NRMat<double> &a, NRMat<double> *u, NRVec<double> &s
 //extern "C" void FORNAME(dgeqrf)(const int *M, const int *N, double *A, const int *LDA, double *TAU, double *WORK, int *LWORK, int *INFO);
-extern "C" void FORNAME(dgeev)(const char *JOBVL, const char *JOBVR, const int *N,
+extern "C" void FORNAME(dgeev)(const char *JOBVL, const char *JOBVR, const FINT *N,
-		double *A, const int *LDA, double *WR, double *WI, double *VL, const int *LDVL,
+		double *A, const FINT *LDA, double *WR, double *WI, double *VL, const FINT *LDVL,
-		double *VR, const int *LDVR, double *WORK, const int *LWORK, int *INFO );
+		double *VR, const FINT *LDVR, double *WORK, const FINT *LWORK, FINT *INFO );
-extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const int *N,
+extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const FINT *N,
-		double *A, const int *LDA, double *B, const int *LDB, double *WR, double *WI,  double *WBETA, 
+		double *A, const FINT *LDA, double *B, const FINT *LDB, double *WR, double *WI,  double *WBETA, 
-		 double *VL, const int *LDVL,  double *VR, const int *LDVR,  
+		 double *VL, const FINT *LDVL,  double *VR, const FINT *LDVR,  
-		double *WORK, const int *LWORK, int *INFO );
+		double *WORK, const FINT *LWORK, FINT *INFO );
 //statics for sorting
@@ -802,23 +867,41 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 	char jobvl = vl ? 'V' : 'N';
 	char jobvr = vr ? 'V' : 'N';
 	double work0;
-	int lwork = -1;
+	FINT lwork = -1;
-	int r;
+	FINT r;
-	int lda=a.ncols();
+	FINT lda=a.ncols();
-	int ldb=0;
+	FINT ldb=0;
 	if(b) ldb=b->ncols();
-	int ldvl= vl?vl->ncols():lda;
+	FINT ldvl= vl?vl->ncols():lda;
-	int ldvr= vr?vr->ncols():lda;
+	FINT ldvr= vr?vr->ncols():lda;
-	if(b)  FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
+
 #ifdef FORINT
        FINT ntmp = n; 
        if(b)  FORNAME(dggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
                        &ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
 	else FORNAME(dgeev)(&jobvr, &jobvl, &ntmp, a, &lda, wr, wi, vr?vr[0]:(double *)0,
 			&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
 #else
        if(b)  FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
                        &ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
 	else FORNAME(dgeev)(&jobvr, &jobvl, &n, a, &lda, wr, wi, vr?vr[0]:(double *)0,
 			&ldvr, vl?vl[0]:(double *)0, &ldvl, &work0, &lwork, &r);
-	lwork = (int) work0;
+#endif
        lwork = (FINT) work0;
 	double *work = new double[lwork];
 #ifdef FORINT	
 	if(b) FORNAME(dggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
                        &ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
 	else FORNAME(dgeev)(&jobvr, &jobvl, &ntmp, a, &lda, wr, wi, vr?vr[0]:(double *)0,
 			&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
 #else
 	if(b) FORNAME(dggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, wr, wi, *beta, vr?vr[0]:(double *)0,
                        &ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
 	else FORNAME(dgeev)(&jobvr, &jobvl, &n, a, &lda, wr, wi, vr?vr[0]:(double *)0,
 			&ldvr, vl?vl[0]:(double *)0, &ldvl, work, &lwork, &r);
 #endif
 	delete[] work;
 //std::cout <<"TEST dgeev\n"<<wr<<wi<<*vr<<*vl<<std::endl;
@@ -1208,16 +1291,16 @@ return r;
 //Cholesky interface
-extern "C" void FORNAME(dpotrf)(const char *UPLO, const int *N, double *A, const int *LDA, int *INFO);
+extern "C" void FORNAME(dpotrf)(const char *UPLO, const FINT *N, double *A, const FINT *LDA, FINT *INFO);
-extern "C" void FORNAME(zpotrf)(const char *UPLO, const int *N, complex<double> *A, const int *LDA, int *INFO);
+extern "C" void FORNAME(zpotrf)(const char *UPLO, const FINT *N, complex<double> *A, const FINT *LDA, FINT *INFO);
 void cholesky(NRMat<double> &a, bool upper)
 {
 if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
-int lda=a.ncols();
+FINT lda=a.ncols();
-int n=a.nrows();
+FINT n=a.nrows();
 char uplo=upper?'U':'L';
-int info;
+FINT info;
 a.copyonwrite();
 FORNAME(dpotrf)(&uplo, &n, a, &lda, &info);
 if(info) {std::cerr << "Lapack error "<<info<<std::endl; laerror("error in Cholesky");}
@@ -1233,10 +1316,10 @@ else
 void cholesky(NRMat<complex<double> > &a, bool upper)
 {
 if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
-int lda=a.ncols();
+FINT lda=a.ncols();
-int n=a.nrows();
+FINT n=a.nrows();
 char uplo=upper?'U':'L';
-int info;
+FINT info;
 a.copyonwrite();
 a.transposeme();//switch to Fortran order
 FORNAME(zpotrf)(&uplo, &n, a, &lda, &info);
@@ -1250,14 +1333,14 @@ else
 //various norms
-extern "C" double FORNAME(zlange)( const char *NORM, const int *M, const int *N, complex<double> *A, const int *LDA, double *WORK); 
+extern "C" double FORNAME(zlange)( const char *NORM, const FINT *M, const FINT *N, complex<double> *A, const FINT *LDA, double *WORK); 
-extern "C" double FORNAME(dlange)( const char *NORM, const int *M, const int *N, double *A, const int *LDA, double *WORK); 
+extern "C" double FORNAME(dlange)( const char *NORM, const FINT *M, const FINT *N, double *A, const FINT *LDA, double *WORK); 
 double MatrixNorm(NRMat<complex<double> > &A, const char norm)
 {
        const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
-        const int M = A.nrows();
+        const FINT M = A.nrows();
-        const int N = A.ncols();
+        const FINT N = A.ncols();
        double work[M];
        const double ret = FORNAME(zlange)(&TypNorm, &M, &N, A[0], &M, &work[0]);
        return ret;
@@ -1266,8 +1349,8 @@ double MatrixNorm(NRMat<complex<double> > &A, const char norm)
 double MatrixNorm(NRMat<double > &A, const char norm)
 {
        const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
-        const int M = A.nrows();
+        const FINT M = A.nrows();
-        const int N = A.ncols();
+        const FINT N = A.ncols();
        double work[M];
        const double ret = FORNAME(dlange)(&TypNorm, &M, &N, A[0], &M, &work[0]);
        return ret;
@@ -1276,15 +1359,15 @@ double MatrixNorm(NRMat<double > &A, const char norm)
 //condition number
-extern "C" void FORNAME(zgecon)( const char *norm, const int *n, complex<double> *A, const int *LDA, const double *anorm, double *rcond, complex<double> *work, double *rwork, int *info);
+extern "C" void FORNAME(zgecon)( const char *norm, const FINT *n, complex<double> *A, const FINT *LDA, const double *anorm, double *rcond, complex<double> *work, double *rwork, FINT *info);
-extern "C" void FORNAME(dgecon)( const char *norm, const int *n, double *A, const int *LDA, const double *anorm, double *rcond, double *work, double *rwork, int *info);
+extern "C" void FORNAME(dgecon)( const char *norm, const FINT *n, double *A, const FINT *LDA, const double *anorm, double *rcond, double *work, double *rwork, FINT *info);
 double CondNumber(NRMat<complex<double> > &A, const char norm)
 {
        const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
-        const int N = A.nrows();
+        const FINT N = A.nrows();
        double Norma(0.0), ret(0.0);
-        int info;
+        FINT info;
        complex<double> *work;
        double *rwork;
@@ -1305,9 +1388,9 @@ double CondNumber(NRMat<complex<double> > &A, const char norm)
 double CondNumber(NRMat<double> &A, const char norm)
 {
        const char TypNorm = (tolower(norm) == 'o')?'I':'O'; //switch c-order/fortran-order
-        const int N = A.nrows();
+        const FINT N = A.nrows();
        double Norma(0.0), ret(0.0);
-        int info;
+        FINT info;
        double *work;
        double *rwork;
@@ -44,6 +44,16 @@ namespace LA {
 template <typename T>
 void NRSMat<T>::put(int fd, bool dim, bool transp) const
 {
 #ifdef CUDALA
 if(location!=cpu)
        {
        NRSMat<T> tmp= *this;
        tmp.moveto(cpu);
        tmp.put(fd,dim,transp);
        return;
        }
 #endif
 errno=0;
 if(dim)
 {
@@ -56,6 +66,18 @@ LA_traits<T>::multiput(NN2,fd,v,dim);
 template <typename T>
 void NRSMat<T>::get(int fd, bool dim, bool transp)
 {
 #ifdef CUDALA
 if(location!=cpu)
        {
        NRSMat<T> tmp;
        tmp.moveto(cpu);
        tmp.get(fd,dim,transp);
        tmp.moveto(location);
        *this = tmp;
        return;
        }
 #endif
 int nn0[2]; //align at least 8-byte
 errno=0;
 if(dim)
@@ -402,6 +424,9 @@ cblas_dcopy(nn*(nn+1)/2,&rhs(0,0),1,((double *)v) + (imagpart?1:0),2);
 }
 //some template specializations leading to BLAS/CUBLAS calls
 //////////////////////////////////////////////////////////////////////////////
@@ -29,12 +29,20 @@ protected:
 	int nn;
 	T *v;
 	int *count;
 #ifdef CUDALA
       GPUID location;
 #endif
 public:
 	friend class NRVec<T>;
 	friend class NRMat<T>;
-	inline NRSMat() : nn(0),v(0),count(0) {};
+	inline NRSMat() : nn(0),v(0),count(0) 
-	inline explicit NRSMat(const int n);			// Zero-based array
+                        {
 #ifdef CUDALA
                        location = DEFAULT_LOC;
 #endif
                        };
 	inline explicit NRSMat(const int n, const GPUID loc= undefined);// Zero-based array
 	inline NRSMat(const T &a, const int n);	//Initialize to constant
 	inline NRSMat(const T *a, const int n);	// Initialize to array
 	inline NRSMat(const NRSMat &rhs);		// Copy constructor
@@ -45,6 +53,13 @@ public:
 	NRSMat & operator=(const NRSMat &rhs);	//assignment
        void randomize(const typename LA_traits<T>::normtype &x);
 	NRSMat & operator=(const T &a);		//assign a to diagonal
 #ifdef CUDALA
        inline GPUID getlocation() const {return location;}
        void moveto(const GPUID dest);
 #else
        inline GPUID getlocation() const {return cpu;}
        void moveto(const GPUID dest) {};
 #endif
        const bool operator!=(const NRSMat &rhs) const {if(nn!=rhs.nn) return 1; return LA_traits<T>::gencmp(v,rhs.v,NN2);} //memcmp for scalars else elementwise
        const bool operator==(const NRSMat &rhs) const {return !(*this != rhs);};
 	inline NRSMat & operator*=(const T &a);
@@ -65,8 +80,8 @@ public:
 	const NRMat<T> operator*(const NRMat<T> &rhs) const; // SMat*Mat 
 	const T dot(const NRSMat &rhs) const; // Smat.Smat//@@@for complex do conjugate
 	const T dot(const NRVec<T> &rhs) const; //Smat(as vec).vec //@@@for complex do conjugate
-	const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
+	const NRVec<T> operator*(const NRVec<T> &rhs) const {NRVec<T> result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
-	const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
+	const NRVec<complex<T> > operator*(const NRVec<complex<T> > &rhs) const {NRVec<complex<T> > result(nn,rhs.getlocation()); result.gemv((T)0,*this,'n',(T)1,rhs); return result;}; // Mat * Vec
 	const T* diagonalof(NRVec<T> &, const bool divide=0, bool cache=false) const; //get diagonal
 	void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const {r.gemv(beta,*this,trans,alpha,x);};
 	void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);};
@@ -108,29 +123,63 @@ namespace LA {
 // ctors
 template <typename T>
-inline NRSMat<T>::NRSMat(const int n) : nn(n), v(new T[NN2]),
+inline NRSMat<T>::NRSMat(const int n, const GPUID loc) : nn(n), count(new int(1)) 
 				count(new int) {*count = 1;}
 template <typename T>
 inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n),
 	        v(new T[NN2]), count(new int)
 {
-	*count =1;
+#ifdef CUDALA
-	if(a != (T)0) for(int i=0; i<NN2; i++) v[i] = a;
+        location= (loc==undefined?DEFAULT_LOC:loc);
-	else memset(v, 0, NN2*sizeof(T));
+        if(location==cpu)
 #endif
 	v=new T[NN2];
 #ifdef CUDALA
 	else v= (T*) gpualloc(NN2*sizeof(T));
 #endif
 }
 template <typename T>
-inline NRSMat<T>::NRSMat(const T *a, const int n) : nn(n),
+inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n), count(new int(1))
 	        v(new T[NN2]), count(new int)
 {
-	*count = 1;
+#ifdef CUDALA
-	memcpy(v, a, NN2*sizeof(T));
+        location=DEFAULT_LOC;
 	if(location==cpu)
 #endif
        {
 	v=new T[NN2];
 	if(a != (T)0) for(int i=0; i<NN2; i++) v[i] = a;
 	else memset(v, 0, NN2*sizeof(T));
 	}
 #ifdef CUDALA
 	else
 	{
 	v= (T*) gpualloc(NN2*sizeof(T));
 	cublasSetVector(NN2,sizeof(T),&a,0,v,1);
 	}
 #endif
 }
 template <typename T>
 inline NRSMat<T>::NRSMat(const T *a, const int n) : nn(n), count(new int(1))
 {
 #ifdef CUDALA
        location=DEFAULT_LOC;
        if(location==cpu)
 #endif
 		memcpy(v, a, NN2*sizeof(T));
 #ifdef CUDALA
        else
        {
        v= (T*) gpualloc(NN2*sizeof(T));
        cublasSetVector(NN2,sizeof(T),a,1,v,1);
        }
 #endif
 }
 template <typename T>
 inline NRSMat<T>::NRSMat(const NRSMat<T> &rhs) //copy constructor
 {
 #ifdef CUDALA
        location=rhs.location;
 #endif
 	v = rhs.v;
 	nn = rhs.nn;
 	count = rhs.count;
@@ -140,6 +189,9 @@ inline NRSMat<T>::NRSMat(const NRSMat<T> &rhs) //copy constructor
 template <typename T>
 NRSMat<T>::NRSMat(const NRVec<T> &rhs, const int n) // type conversion
 {
 #ifdef CUDALA
        location=rhs.location;
 #endif
 	nn = n;
 #ifdef DEBUG
 	if (NN2 != rhs.size())
@@ -150,6 +202,7 @@ NRSMat<T>::NRSMat(const NRVec<T> &rhs, const int n) // type conversion
 	(*count)++;
 }
 // S *= a
 template<>
 inline NRSMat<double> & NRSMat<double>::operator*=(const double & a)
@@ -437,33 +490,31 @@ NRSMat<T>::~NRSMat()
 {
        if (!count) return;
        if (--(*count) <= 0) {
-                if (v) delete[] (v);
+                if (v) 
 		    {
 #ifdef CUDALA
                    if(location==cpu)
 #endif
 			delete[] v;
 #ifdef CUDALA
                        else gpufree(v);
 #endif
                    }
                delete count;
        }
 }
 // assignment with a physical copy
 template <typename T>
 NRSMat<T> & NRSMat<T>::operator|=(const NRSMat<T> &rhs)
 {
-        if (this != &rhs) {
+#ifdef DEBUG
-                if(!rhs.v) laerror("unallocated rhs in NRSMat operator |=");
+                if (!rhs.v) laerror("unallocated rhs in NRSMat operator |=");
-                if(count)
+#endif
-                        if(*count > 1) {        // detach from the other
+        if (this == &rhs) return *this;
-                                --(*count);
+        *this = rhs;
-                                nn = 0;
+        this->copyonwrite();
                                count = 0;
                                v = 0;
                        }
                if (nn != rhs.nn) {
                        if(v) delete [] (v);
                        nn = rhs.nn;
                }
                if (!v) v = new T[NN2];
                if (!count) count = new int;
                *count = 1;
                memcpy(v, rhs.v, NN2*sizeof(T));
        }
        return *this;
 }
@@ -474,13 +525,24 @@ NRSMat<T> & NRSMat<T>::operator=(const NRSMat<T> & rhs)
 {
        if (this == & rhs) return *this;
        if (count)
-                if(--(*count) == 0) {
+                if(--(*count) == 0) 
-                        delete [] v;
+			{
 #ifdef CUDALA
        		if(location==cpu)
 #endif
                        	delete [] v;
 #ifdef CUDALA
 		        else
               			 gpufree(v);
 #endif
                        delete count;
-                }
+                	}
        v = rhs.v;
        nn = rhs.nn;
        count = rhs.count;
 #ifdef CUDALA
        location=rhs.location;
 #endif
        if (count) (*count)++;
        return *this;
 }
@@ -495,9 +557,24 @@ void NRSMat<T>::copyonwrite()
                (*count)--;
                count = new int;
                *count = 1;
-                T *newv = new T[NN2];
+		T *newv;
-                memcpy(newv, v, NN2*sizeof(T));
+#ifdef CUDALA
-                v = newv;
+	    if(location==cpu)
       		 {
 #endif
                 newv = new T[NN2];
                 memcpy(newv, v, NN2*sizeof(T));
 #ifdef CUDALA
 	        }
     	else
        	{
        	 newv = (T *) gpualloc(NN2*sizeof(T));
         	if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
         	cublasScopy(NN2*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
        	}
 #endif
                 v = newv;
        }
 }
@@ -514,7 +591,16 @@ void NRSMat<T>::resize(const int n)
 	    	if(n==0)
 	        {
 	        if(--(*count) <= 0) {
-	                if(v) delete[] (v);
+			if(v) {
 #ifdef CUDALA
                        if(location==cpu)
 #endif
 		                 delete[] (v);
 #ifdef CUDALA
                        else
                                gpufree(v);
 #endif
 			}
 	                delete count;
 	                }
 	        count=0;
@@ -534,16 +620,71 @@ void NRSMat<T>::resize(const int n)
                count = new int;
                *count = 1;
                nn = n;
 #ifdef CUDALA
     if(location==cpu)
 #endif
                v = new T[NN2];
 #ifdef CUDALA
      else
        v = (T*) gpualloc(NN2*sizeof(T));
 #endif
                return;
        }
        if (n != nn) {
-                nn = n;
+              nn = n;
-                delete[] v;
+#ifdef CUDALA
-                v = new T[NN2];
+	     if(location==cpu)
 #endif
 			{
                	delete[] v;
                	v = new T[NN2];
 			}
 #ifdef CUDALA
 	      else
       			 {
 			 gpufree(v);
 			 v = (T*) gpualloc(NN2*sizeof(T));
 		         }
 #endif
        }
 }
 #ifdef CUDALA
 template<typename T>
 void NRSMat<T>::moveto(const GPUID dest)
 {
 if(location==dest) return;
 location=dest;
 if(v && !count) laerror("internal inconsistency of reference counting 1");
 if (!count) return;
 if(v && *count==0) laerror("internal inconsistency of reference counting 2");
 if(!v) return;
 T *vold = v;
 if(dest == cpu) //moving from GPU to CPU
        {
        v = new T[NN2];
        gpuget(NN2,sizeof(T),vold,v);
        if(*count == 1) gpufree(vold);
        else {--(*count); count = new int(1);}
        }
 else    //moving from CPU to GPU
        {
        v=(T *) gpualloc(NN2*sizeof(T));
        gpuput(NN2,sizeof(T),vold,v);
        if(*count == 1) delete[] vold;
        else {--(*count); count = new int(1);}
        }
 }
 #endif
 template<typename T>
 NRSMat<complex<T> > complexify(const NRSMat<T> &rhs)
@@ -554,10 +695,15 @@ for(int i=0; i<rhs.nrows(); ++i)
 return r;
 }
 // I/O
 template <typename T>
 std::ostream& operator<<(std::ostream &s, const NRSMat<T> &x)
                {
 #ifdef CUDALA
 if(x.getlocation()==cpu)
   {
 #endif
                int i,j,n;
                n=x.nrows();
                s << n << ' ' << n << '\n';
@@ -566,12 +712,25 @@ std::ostream& operator<<(std::ostream &s, const NRSMat<T> &x)
                        for(j=0; j<n;j++) s << (typename LA_traits_io<T>::IOtype)x(i,j) << (j==n-1 ? '\n' : ' ');
                        }
                return s;
 #ifdef CUDALA
   }
 else
    {
    NRSMat<T> tmp=x;
    tmp.moveto(cpu);
    return s<<tmp;
    }
 #endif
                }
 template <typename T>
 std::istream& operator>>(std::istream  &s, NRSMat<T> &x)
                {
 #ifdef CUDALA
 if(x.getlocation()==cpu)
  {
 #endif
                int i,j,n,m;
                s >> n >> m;
                if(n!=m) laerror("input symmetric matrix not square");
@@ -579,6 +738,18 @@ std::istream& operator>>(std::istream  &s, NRSMat<T> &x)
 		typename LA_traits_io<T>::IOtype tmp;
                for(i=0;i<n;i++) for(j=0; j<m;j++) {s>>tmp; x(i,j)=tmp;}
                return s;
 #ifdef CUDALA
  }
 else
                {
                NRSMat<T> tmp;
                tmp.moveto(cpu);
                s >> tmp;
                tmp.moveto(x.getlocation());
                x=tmp;
                return s;
                }
 #endif
                }
@@ -24,6 +24,7 @@
 using namespace std;
 using namespace LA;
 extern void test(const NRVec<double> &x);
@@ -1551,7 +1552,7 @@ cout <<"Error = "<<(cx.transpose(true)*cx -bb).norm()<<endl;
 }
-if(1)
+if(0)
 {
 int n;
 cin >>n;
@@ -1574,5 +1575,59 @@ cout <<"Cholesky factor filling = "<<b.simplify()<<endl;
 }
 if(0)
 {
 int n;
 cin >>n;
 NRMat<double> a(n,n),b(n,n);
 a.randomize(1.);
 b.randomize(1.);
 NRMat<double>c;
 double t=clock()/((double) (CLOCKS_PER_SEC));
 int rep=1+10000000000LL/n/n/n;
 cout <<"Repetitions " <<rep<<endl;
 for(int i=0; i<rep; ++i) c=a*b;
 cout <<"CPU time (ms) "<<(clock()/((double) (CLOCKS_PER_SEC))-t)*1000./rep <<"\n";
 NRMat<double>cgpu;
 a.moveto(gpu1);
 b.moveto(gpu1);
 t=clock()/((double) (CLOCKS_PER_SEC));
 for(int i=0; i<rep; ++i) cgpu=a*b;
 cout <<"GPU time (ms) "<<(clock()/((double) (CLOCKS_PER_SEC))-t)*1000./rep <<"\n";
 cgpu.moveto(cpu);
 cout << "Error = "<<(c-cgpu).norm()<<endl;
 }
 if(1)
 {
 int n;
 cin >>n;
 NRMat<double> a(n,n);
 a.randomize(1.);
 NRVec<double> v(n);
 v.randomize(1.);
 NRSMat<double> as(n);
 as.randomize(1.);
 NRVec<double>w = a*v;
 NRVec<double>ws = as*v;
 NRMat<double>c(n,n);
 c=exp(a);
 a.moveto(gpu1);
 v.moveto(gpu1);
 as.moveto(gpu1);
 NRVec<double>wgpu = a*v;
 NRVec<double>wsgpu = as*v;
 w.moveto(gpu1);
 ws.moveto(gpu1);
 cout << "Error gemv = "<<(w-wgpu).norm()<<endl;
 cout << "Error symv = "<<(ws-wsgpu).norm()<<endl;
 NRMat<double>cgpu;
 cgpu=exp(a);
 c.moveto(gpu1);
 cout << "Error = "<<(c-cgpu).norm()<<endl;
 }
 }
@@ -84,6 +84,16 @@ NRVec<T>::NRVec(const NRMat<T> &rhs)
 template <typename T>
 void NRVec<T>::put(int fd, bool dim, bool transp) const
 {
 #ifdef CUDALA
 if(location!=cpu)
        {
        NRVec<T> tmp= *this;
        tmp.moveto(cpu);
        tmp.put(fd,dim,transp);
        return;
        }
 #endif
 errno=0;
 int pad=1; //align at least 8-byte
 if(dim)
@@ -94,9 +104,21 @@ if(sizeof(int) != write(fd,&pad,sizeof(int))) laerror("cannot write");
 LA_traits<T>::multiput(nn,fd,v,dim);
 }
 template <typename T>
 void NRVec<T>::get(int fd, bool dim, bool transp)
 {
 #ifdef CUDALA
 if(location!=cpu)
        {
        NRVec<T> tmp;
        tmp.moveto(cpu);
        tmp.get(fd,dim,transp);
        tmp.moveto(location);
        *this = tmp;
        return;
        }
 #endif
 int nn0[2]; //align at least 8-byte
 errno=0;
 if(dim)
@@ -319,9 +341,16 @@ void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
 	if ((trans == 'n'?A.ncols():A.nrows()) != x.size())
 		laerror("incompatible sizes in gemv A*x");
 #endif
 	SAME_LOC3(*this,x,A);
 	copyonwrite();
-	cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans),
+#ifdef CUDALA
-			A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+	if(location==cpu)
 #endif
 		cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
 #ifdef CUDALA
        else
 		cublasDgemv((trans=='n' ?'T':'N'),A.ncols(), A.nrows(),alpha, A, A.ncols(), x.v, 1, beta, v, 1);
 #endif
 }
@@ -362,8 +391,16 @@ void NRVec<double>::gemv(const double beta, const NRSMat<double> &A,
 #ifdef DEBUG
 	if (A.ncols()!=x.size()) laerror("incompatible dimension in gemv A*x");
 #endif
 SAME_LOC3(*this,A,x);
 	copyonwrite();
 #ifdef CUDALA
 if(location==cpu)
 #endif
 	cblas_dspmv(CblasRowMajor, CblasLower, A.ncols(), alpha, A, x.v, 1, beta, v, 1);
 #ifdef CUDALA
 else
 	cublasDspmv('U',A.ncols(), alpha, A, x.v, 1, beta, v, 1);
 #endif
 }
 template<>
@@ -420,6 +457,7 @@ NRVec<complex<double> >::otimes(const NRVec< complex<double> > &b, const bool co
 template<typename T>
 int NRVec<T>::sort(int direction, int from, int to, int *perm)
 {
 NOT_GPU(*this);
 copyonwrite();
 if(to == -1) to=nn-1;
 if(direction) return memqsort<1,NRVec<T>,int,int>(*this,perm,from,to);
@@ -427,6 +465,10 @@ else return memqsort<0,NRVec<T>,int,int>(*this,perm,from,to);
 }
 //////////////////////////////////////////////////////////////////////////////
 //// forced instantization in the corespoding object file
@@ -30,6 +30,9 @@ template <typename T> void lawritemat(FILE *file,const T *a,int r,int c,
 // Memory allocated constants for cblas routines
 const static complex<double> CONE = 1.0, CMONE = -1.0, CZERO = 0.0;
 #ifdef CUDALA
 const static cuDoubleComplex CUONE = {1.,0.}, CUMONE = {-1.,0.}, CUZERO = {0.,0.};
 #endif
 // Macros to construct binary operators +,-,*, from +=, -=, *=
 // for 3 cases: X + a, a + X, X + Y
@@ -44,7 +47,7 @@ template<class T> \
 #define NRVECMAT_OPER2(E,X) \
 template<class T> \
-	inline const NR##E<T> NR##E<T>::operator X(const NR##E<T> &a) const \
+inline const NR##E<T> NR##E<T>::operator X(const NR##E<T> &a) const \
 { return NR##E(*this) X##= a; }
@@ -55,12 +58,32 @@ protected:
 	int nn;
 	T *v;
 	int *count;
 #ifdef CUDALA
 	GPUID location;
 #endif
 public:
 	friend class NRSMat<T>;
 	friend class NRMat<T>;
-	inline NRVec(): nn(0),v(0),count(0){};
+	inline NRVec(): nn(0),v(0),count(0)
-	explicit inline NRVec(const int n) : nn(n), v(new T[n]), count(new int(1)) {};
+                        {
 #ifdef CUDALA
                        location = DEFAULT_LOC;
 #endif
                        };
 	explicit inline NRVec(const int n, const GPUID loc= undefined) : nn(n), count(new int(1)) 
                        {
 #ifdef CUDALA
                        if(loc==undefined) location = DEFAULT_LOC; else location = loc;
 			if(location==cpu)
 #endif
 				v= new T[n];
 #ifdef CUDALA
 			else
 				v= (T*) gpualloc(n*sizeof(T));
 #endif
                        };
 	inline NRVec(const T &a, const int n);
        inline NRVec(const T *a, const int n);
 	inline NRVec(T *a, const int n, bool skeleton);
@@ -71,6 +94,13 @@ public:
 	explicit NRVec(const NRMat<T> &rhs) : NRVec(&rhs[0][0],rhs.nrows()*rhs.ncols()) {};
 #else
 	explicit NRVec(const NRMat<T> &rhs);
 #endif
 #ifdef CUDALA
        inline GPUID getlocation() const {return location;}
        void moveto(const GPUID dest);
 #else
        inline GPUID getlocation() const {return cpu;}
        void moveto(const GPUID dest) {};
 #endif
 	NRVec & operator=(const NRVec &rhs);
 	NRVec & operator=(const T &a);  //assign a to every element
@@ -103,8 +133,8 @@ public:
 	void gemv(const T beta, const SparseMat<T> &a, const char trans, const T alpha, const NRVec &x,const bool treat_as_symmetric=false);
 	void gemv(const typename LA_traits_complex<T>::Component_type beta, const typename LA_traits_complex<T>::NRMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex<T>::Component_type alpha, const NRVec &x);
 	void gemv(const typename LA_traits_complex<T>::Component_type beta, const typename LA_traits_complex<T>::NRSMat_Noncomplex_type &a, const char trans, const typename LA_traits_complex<T>::Component_type alpha, const NRVec &x);
-	const NRVec operator*(const NRMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
+	const NRVec operator*(const NRMat<T> &mat) const {NRVec<T> result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
-	const NRVec operator*(const NRSMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
+	const NRVec operator*(const NRSMat<T> &mat) const {NRVec<T> result(mat.ncols(),mat.getlocation()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
 	const NRVec operator*(const SparseMat<T> &mat) const {NRVec<T> result(mat.ncols()); result.gemv((T)0,mat,'t',(T)1,*this); return result;};
 	const NRMat<T> otimes(const NRVec<T> &rhs, const bool conjugate=false, const T &scale=1) const; //outer product
 	inline const NRMat<T> operator|(const NRVec<T> &rhs) const {return otimes(rhs,true);};
@@ -150,29 +180,58 @@ public:
 #include "sparsemat.h"
 #include "sparsesmat.h"
 namespace LA {
 // formatted I/O
 template <typename T>
 std::ostream & operator<<(std::ostream &s, const NRVec<T> &x)
 {
 #ifdef CUDALA
 if(x.getlocation()==cpu)
   {
 #endif
  int i, n;
  n = x.size();
  s << n << std::endl;
  for(i=0; i<n; i++) s << (typename LA_traits_io<T>::IOtype)x[i] << (i == n-1 ? '\n' : ' ');
  return s;
 #ifdef CUDALA
   }
 else
    {
    NRVec<T> tmp=x;
    tmp.moveto(cpu);
    return s<<tmp;
    }
 #endif
 }
 template <typename T>
 std::istream & operator>>(std::istream &s, NRVec<T> &x)
 {
 #ifdef CUDALA
 if(x.getlocation()==cpu)
  {
 #endif
  int i,n;
  s >> n;
  x.resize(n);
  typename LA_traits_io<T>::IOtype tmp;
  for(i=0; i<n; i++) {s >> tmp; x[i]=tmp;}
  return s;
 #ifdef CUDALA
  }
 else
                {
                NRVec<T> tmp;
                tmp.moveto(cpu);
                s >> tmp;
                tmp.moveto(x.getlocation());
                x=tmp;
                return s;
                }
 #endif
 }
@@ -180,22 +239,51 @@ std::istream & operator>>(std::istream &s, NRVec<T> &x)
 // ctors
 template <typename T>
-inline NRVec<T>::NRVec(const T& a, const int n) : nn(n), v(new T[n]), count(new int)
+inline NRVec<T>::NRVec(const T& a, const int n) : nn(n), count(new int)
 {
 	*count = 1;
 #ifdef CUDALA
 	location=DEFAULT_LOC;
    if(location==cpu)
 	{
 #endif
 	v = new T[n];
 	if(a != (T)0)
 		for(int i=0; i<n; i++)
 			v[i] = a;
 	else
 		memset(v, 0, nn*sizeof(T));
 #ifdef CUDALA
 	}
    else
 	{
 	v= (T*) gpualloc(n*sizeof(T));	
 	cublasSetVector(n,sizeof(T),&a,0,v,1);
 	}
 #endif
 }
 template <typename T>
 inline NRVec<T>::NRVec(const T *a, const int n) : nn(n), count(new int)
 {
-		v=new T[n];
+#ifdef CUDALA
-		*count = 1;
+location=DEFAULT_LOC;
-		memcpy(v, a, n*sizeof(T));
+    if(location==cpu)
        {
 #endif
 	v=new T[n];
 	*count = 1;
 	memcpy(v, a, n*sizeof(T));
 #ifdef CUDALA
        }
    else
        {
        v= (T*) gpualloc(n*sizeof(T));
        cublasSetVector(n,sizeof(T),a,1,v,1);
        }
 #endif
 }
 template <typename T>
@@ -203,12 +291,28 @@ inline NRVec<T>::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int)
 {
 	if(!skeleton)
 		{
 #ifdef CUDALA
 location=DEFAULT_LOC;
    if(location==cpu)
        {
 #endif
 		v=new T[n];
 		*count = 1;
 		memcpy(v, a, n*sizeof(T));
 #ifdef CUDALA
        }
    else
        {
        v= (T*) gpualloc(n*sizeof(T));
        cublasSetVector(n,sizeof(T),a,1,v,1);
        }
 #endif
 		}
 	else
 		{
 #ifdef CUDALA
 		if(location!=cpu) laerror("NRVec() with skeleton option cannot be on GPU");
 #endif
 		*count = 2;
 		v=a;
 		}
@@ -217,6 +321,9 @@ inline NRVec<T>::NRVec(T *a, const int n, bool skeleton) : nn(n), count(new int)
 template <typename T>
 inline NRVec<T>::NRVec(const NRVec<T> &rhs)
 {
 #ifdef CUDALA
 	location=rhs.location;
 #endif
 	v = rhs.v;
 	nn = rhs.nn;
 	count = rhs.count;
@@ -226,6 +333,9 @@ inline NRVec<T>::NRVec(const NRVec<T> &rhs)
 template <typename T>
 inline NRVec<T>::NRVec(const NRSMat<T> &rhs)
 {
 #ifdef CUDALA
        location=rhs.location;
 #endif
 	nn = rhs.nn;
 	nn = NN2;
 	v = rhs.v;
@@ -233,28 +343,11 @@ inline NRVec<T>::NRVec(const NRSMat<T> &rhs)
 	(*count)++;
 }
-// x += a
+// x +/-= a
 template<>
 inline NRVec<double> & NRVec<double>::operator+=(const double &a)
 {
 	copyonwrite();
 	cblas_daxpy(nn, 1.0, &a, 0, v, 1);
 	return *this;
 }
 template<>
 inline NRVec< complex<double> > &
 NRVec< complex<double> >::operator+=(const complex<double> &a)
 {
 	copyonwrite();
 	cblas_zaxpy(nn, &CONE, &a, 0, v, 1);
 	return *this;
 }
 //and for general type
 template <typename T>
 inline NRVec<T> & NRVec<T>::operator+=(const T &a)
 {
 	NOT_GPU(*this);
        copyonwrite();
 	int i;
 	for(i=0; i<nn; ++i) v[i]+=a;
@@ -262,65 +355,26 @@ inline NRVec<T> & NRVec<T>::operator+=(const T &a)
 }
 // x -= a
 template<>
 inline NRVec<double> & NRVec<double>::operator-=(const double &a)
 {
 	copyonwrite();
 	cblas_daxpy(nn, -1.0, &a, 0, v, 1);
 	return *this;
 }
 template<>
 inline NRVec< complex<double> > &
 NRVec< complex<double> >::operator-=(const complex<double> &a)
 {
 	copyonwrite();
 	cblas_zaxpy(nn, &CMONE, &a, 0, v, 1);
 	return *this;
 }
 //and for general type
 template <typename T>
 inline NRVec<T> & NRVec<T>::operator-=(const T &a)
 {
 	NOT_GPU(*this);
        copyonwrite();
-        int i;
+	int i;
-        for(i=0; i<nn; ++i) v[i]-=a;
+	for(i=0; i<nn; ++i) v[i]-=a;
        return *this;
 }
 // x += x
 template<>
 inline NRVec<double> & NRVec<double>::operator+=(const NRVec<double> &rhs)
 {
 #ifdef DEBUG
 	if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
 #endif
 	copyonwrite();
 	cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1);
 	return *this;
 }
 template<>
 inline NRVec< complex<double> > &
 NRVec< complex<double> >::operator+=(const NRVec< complex<double> > &rhs)
 {
 #ifdef DEBUG
 	if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
 #endif
 	copyonwrite();
 	cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1);
 	return *this;
 }
 //and for general type
 template <typename T>
 inline NRVec<T> & NRVec<T>::operator+=(const NRVec<T> &rhs)
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
 NOT_GPU(*this);
 NOT_GPU(rhs);
 #endif
        copyonwrite();
 	int i;
@@ -346,6 +400,8 @@ inline NRVec<T> & NRVec<T>::operator/=(const NRVec<T> &rhs)
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("/= of incompatible vectors");
 NOT_GPU(*this);
 NOT_GPU(rhs);
 #endif
        copyonwrite();
        int i;
@@ -356,35 +412,13 @@ inline NRVec<T> & NRVec<T>::operator/=(const NRVec<T> &rhs)
 // x -= x
 template<>
 inline NRVec<double> & NRVec<double>::operator-=(const NRVec<double> &rhs)
 {
 #ifdef DEBUG
 	if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
 #endif
 	copyonwrite();
 	cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1);
 	return *this;
 }
 template<>
 inline NRVec< complex<double> > &
 NRVec< complex<double> >::operator-=(const NRVec< complex<double> > &rhs)
 {
 #ifdef DEBUG
 	if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
 #endif
 	copyonwrite();
 	cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1);
 	return *this;
 }
 //and for general type
 template <typename T>
 inline NRVec<T> & NRVec<T>::operator-=(const NRVec<T> &rhs)
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
 NOT_GPU(*this);
 NOT_GPU(rhs);
 #endif
        copyonwrite();
        int i;
@@ -394,27 +428,10 @@ inline NRVec<T> & NRVec<T>::operator-=(const NRVec<T> &rhs)
 // x *= a
 template<>
 inline NRVec<double> & NRVec<double>::operator*=(const double &a)
 {
 	copyonwrite();
 	cblas_dscal(nn, a, v, 1);
 	return *this;
 }
 template<>
 inline NRVec< complex<double> > &
 NRVec< complex<double> >::operator*=(const complex<double> &a)
 {
 	copyonwrite();
 	cblas_zscal(nn, &a, v, 1);
 	return *this;
 }
 //and for general type
 template <typename T>
 inline NRVec<T> & NRVec<T>::operator*=(const T &a)
 {
 NOT_GPU(*this);
        copyonwrite();
        int i;
        for(i=0; i<nn; ++i) v[i]*=a;
@@ -423,33 +440,13 @@ inline NRVec<T> & NRVec<T>::operator*=(const T &a)
 // scalar product x.y
 template<>
 inline const double NRVec<double>::operator*(const NRVec<double> &rhs) const
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("dot of incompatible vectors");
 #endif
        return cblas_ddot(nn, v, 1, rhs.v, 1);
 }
 template<>
 inline const complex<double>
 NRVec< complex<double> >::operator*(const NRVec< complex<double> > &rhs) const
 {
 #ifdef DEBUG
 	if (nn != rhs.nn) laerror("dot of incompatible vectors");
 #endif
 	complex<double> dot;
 	cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot);
 	return dot;
 }
 template<typename T>
 inline const T NRVec<T>::operator*(const NRVec<T> &rhs) const
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("dot of incompatible vectors");
 NOT_GPU(*this);
 NOT_GPU(rhs);
 #endif
 	T dot = 0;
 	for(int i=0; i<nn; ++i) dot+= v[i]*rhs.v[i];
@@ -458,28 +455,6 @@ inline const T NRVec<T>::operator*(const NRVec<T> &rhs) const
 // Sum of elements
 template<>
 inline const double NRVec<double>::asum() const
 {
 	return cblas_dasum(nn, v, 1);
 }
 // Dot product: x * y
 template<>
 inline const double NRVec<double>::dot(const double *y, const int stride) const
 {
 	return cblas_ddot(nn, y, stride, v, 1);
 }
 template<>
 inline const complex<double>
 NRVec< complex<double> >::dot(const complex<double> *y, const int stride) const
 {
 	complex<double> dot;
 	cblas_zdotc_sub(nn, y, stride, v, 1, &dot);
 	return dot;
 }
 // x[i] returns i-th element
 template <typename T>
@@ -489,6 +464,7 @@ inline T & NRVec<T>::operator[](const int i)
 	if(_LA_count_check && *count != 1) laerror("possible lval [] with count > 1");
 	if(i < 0 || i >= nn) laerror("NRVec out of range");
 	if(!v) laerror("[] on unallocated NRVec");
 NOT_GPU(*this);
 #endif
 	return v[i];
 }
@@ -498,6 +474,7 @@ inline const T & NRVec<T>::operator[](const int i) const
 #ifdef DEBUG
 	if(i < 0 || i >= nn) laerror("NRVec out of range");
 	if(!v) laerror("[] on unallocated NRVec");
 NOT_GPU(*this);
 #endif
 	return v[i];
 }
@@ -527,29 +504,6 @@ inline NRVec<T>::operator const T*() const
 	return v;
 }
 // return norm of the Vec
 template<>
 inline const double  NRVec<double>::norm() const
 {
 	return cblas_dnrm2(nn, v, 1);
 }
 template<>
 inline const double NRVec< complex<double> >::norm() const
 {
 	return cblas_dznrm2(nn, v, 1);
 }
 // Max element of the array
 template<>
 inline const double  NRVec<double>::amax() const
 {
 	return v[cblas_idamax(nn, v, 1)];
 }
 template<>
 inline const complex<double> NRVec< complex<double> >::amax() const
 {
 	return v[cblas_izamax(nn, v, 1)];
 }
 // Make Vec unitvector
@@ -576,7 +530,16 @@ NRVec<T>::~NRVec()
 {
        if(!count) return;
        if(--(*count) <= 0) {
-                if(v) delete[] (v);
+                if(v) 
 			{
 #ifdef CUDALA
                    if(location==cpu)
 #endif
 			delete[] (v);
 #ifdef CUDALA
 			else gpufree(v);
 #endif
 			}
                delete count;
        }
 }
@@ -591,12 +554,29 @@ void NRVec<T>::copyonwrite()
    (*count)--;
    count = new int;
    *count = 1;
-    T *newv = new T[nn];
+    T *newv;
-    memcpy(newv, v, nn*sizeof(T));
+#ifdef CUDALA
    if(location==cpu) 
 	{
 #endif
 	newv = new T[nn];
    	memcpy(newv, v, nn*sizeof(T));
 #ifdef CUDALA
        }
     else 
        {
         newv = (T *) gpualloc(nn*sizeof(T));
         if(sizeof(T)%sizeof(float)!=0) laerror("cpu memcpy alignment problem");
         cublasScopy(nn*sizeof(T)/sizeof(float),(const float *) v,1,(float *)newv,1);
        }
 #endif
    v = newv;
  }
 }
 // Asignment
 template <typename T>
 NRVec<T> & NRVec<T>::operator=(const NRVec<T> &rhs)
@@ -606,17 +586,29 @@ NRVec<T> & NRVec<T>::operator=(const NRVec<T> &rhs)
    if(count)
      if(--(*count) == 0)
      {
-        delete[] v;
+#ifdef CUDALA
        if(location==cpu)
 #endif
        	delete[] v;
 #ifdef CUDALA
 	else 
 		gpufree(v);
 #endif
        delete count;
      }
    v = rhs.v;
    nn = rhs.nn;
    count = rhs.count;
 #ifdef CUDALA
    location=rhs.location;
 #endif
    if(count) (*count)++;
  }
  return *this;
 }
 // Resize
 template <typename T>
 void NRVec<T>::resize(const int n)
@@ -629,7 +621,17 @@ void NRVec<T>::resize(const int n)
    if(n==0)
 	{
 	if(--(*count) <= 0) {
-                if(v) delete[] (v);
+                if(v) 
 			{
 #ifdef CUDALA
 			if(location==cpu)
 #endif
 				delete[] (v);
 #ifdef CUDALA
 			else
 				gpufree(v);
 #endif
 			}
                delete count;
        	}
 	count=0;
@@ -648,14 +650,33 @@ void NRVec<T>::resize(const int n)
    count = new int;
    *count = 1;
    nn = n;
-    v = new T[nn];
+#ifdef CUDALA
     if(location==cpu)
 #endif
    	v = new T[nn];
 #ifdef CUDALA
      else
        v = (T*) gpualloc(nn*sizeof(T));
 #endif
    return;
  }
  // *count = 1 in this branch
  if (n != nn) {
    nn = n;
-    delete[] v;
+#ifdef CUDALA
-    v = new T[nn];
+     if(location==cpu)
 #endif
  	{
    	delete[] v;
    	v = new T[nn];
 	}
 #ifdef CUDALA
      else
 	{
 	gpufree(v);
 	v = (T*) gpualloc(nn*sizeof(T));
 	}
 #endif
  }
 }
@@ -664,30 +685,18 @@ void NRVec<T>::resize(const int n)
 template <typename T>
 NRVec<T> & NRVec<T>::operator|=(const NRVec<T> &rhs)
 {
 	if (this != &rhs) {
 #ifdef DEBUG
 		if (!rhs.v) laerror("unallocated rhs in NRVec operator |=");
 #endif
-		if (count)
+        if (this == &rhs) return *this;
-			if (*count > 1) {
+        *this = rhs;
-				--(*count);
+        this->copyonwrite();
-				nn = 0;
+        return *this;
 				count = 0;
 				v = 0;
 			}
 		if (nn != rhs.nn) {
 			if (v) delete[] (v);
 			nn = rhs.nn;
 		}
 		if(!v) v = new T[nn];
 		if(!count) count = new int;
 		*count = 1;
 		memcpy(v, rhs.v, nn*sizeof(T));
 	}
 	return *this;
 }
 template<typename T>
 NRVec<complex<T> > complexify(const NRVec<T> &rhs)
 {
@@ -696,6 +705,291 @@ for(int i=0; i<rhs.size(); ++i)  r[i]=rhs[i];
 return r;
 }
 #ifdef CUDALA
 template<typename T>
 void NRVec<T>::moveto(const GPUID dest)
 {
 if(location==dest) return;
 location=dest;
 if(v && !count) laerror("internal inconsistency of reference counting 1");
 if (!count) return;
 if(v && *count==0) laerror("internal inconsistency of reference counting 2");
 if(!v) return;
 T *vold = v;
 if(dest == cpu) //moving from GPU to CPU
        {
        v = new T[nn];
        gpuget(nn,sizeof(T),vold,v);
        if(*count == 1) gpufree(vold);
        else {--(*count); count = new int(1);}
        }
 else    //moving from CPU to GPU
        {
        v=(T *) gpualloc(nn*sizeof(T));
        gpuput(nn,sizeof(T),vold,v);
        if(*count == 1) delete[] vold;
        else {--(*count); count = new int(1);}
        }
 }
 #endif
 //some template specializations leading to BLAS/CUBLAS calls
 template<>
 inline
 NRVec<double> & NRVec<double>::operator+=(const double &a)
 {
 	copyonwrite();
 #ifdef CUDALA
 	if(location==cpu)
 #endif
 		cblas_daxpy(nn, 1.0, &a, 0, v, 1);
 #ifdef CUDALA
 	else
 		{
 		double *d=gpuputdouble(a);
 		cublasDaxpy(nn, 1.0, d, 0, v, 1);
 		gpufree(d);
 		}
 #endif
 	return *this;
 }
 template<>
 inline
 NRVec< complex<double> > &
 NRVec< complex<double> >::operator+=(const complex<double> &a)
 {
 	copyonwrite();
 #ifdef CUDALA
        if(location==cpu)
 #endif
 		cblas_zaxpy(nn, &CONE, &a, 0, v, 1);
 #ifdef CUDALA
        else
                {
                complex<double> *d=gpuputcomplex(a);
                cublasZaxpy(nn, CUONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1);
                gpufree(d);
                }
 #endif
 	return *this;
 }
 template<>
 inline
 NRVec<double> & NRVec<double>::operator-=(const double &a)
 {
 	copyonwrite();
 #ifdef CUDALA
 	if(location==cpu)
 #endif
 		cblas_daxpy(nn, -1.0, &a, 0, v, 1);
 #ifdef CUDALA
 	else
 		{
 		double *d=gpuputdouble(a);
 		cublasDaxpy(nn, -1.0, d, 0, v, 1);
 		gpufree(d);
 		}
 #endif
 	return *this;
 }
 template<>
 inline
 NRVec< complex<double> > &
 NRVec< complex<double> >::operator-=(const complex<double> &a)
 {
 	copyonwrite();
 #ifdef CUDALA
        if(location==cpu)
 #endif
 		cblas_zaxpy(nn, &CMONE, &a, 0, v, 1);
 #ifdef CUDALA
        else
                {
                complex<double> *d=gpuputcomplex(a);
                cublasZaxpy(nn, CUMONE, (cuDoubleComplex *)d, 0, (cuDoubleComplex *)v, 1);
                gpufree(d);
                }
 #endif
 	return *this;
 }
 template<>
 inline
 NRVec<double> & NRVec<double>::operator+=(const NRVec<double> &rhs)
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
 #endif
        copyonwrite();
        cblas_daxpy(nn, 1.0, rhs.v, 1, v, 1);
        return *this;
 }
 template<>
 inline
 NRVec< complex<double> > &
 NRVec< complex<double> >::operator+=(const NRVec< complex<double> > &rhs)
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
 #endif
        copyonwrite();
        cblas_zaxpy(nn, &CONE, rhs.v, 1, v, 1);
        return *this;
 }
 template<>
 inline
 NRVec<double> & NRVec<double>::operator-=(const NRVec<double> &rhs)
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
 #endif
 SAME_LOC(*this,rhs);
        copyonwrite();
 #ifdef CUDALA
 	if(location==cpu)
 #endif
        	cblas_daxpy(nn, -1.0, rhs.v, 1, v, 1);
 #ifdef CUDALA
 	else
 		cublasDaxpy(nn, -1.0, rhs.v, 1, v, 1);
 #endif
        return *this;
 }
 template<>
 inline
 NRVec< complex<double> > &
 NRVec< complex<double> >::operator-=(const NRVec< complex<double> > &rhs)
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("daxpy of incompatible vectors");
 #endif
        copyonwrite();
        cblas_zaxpy(nn, &CMONE, rhs.v, 1, v, 1);
        return *this;
 }
 template<>
 inline
 NRVec<double> & NRVec<double>::operator*=(const double &a)
 {
        copyonwrite();
        cblas_dscal(nn, a, v, 1);
        return *this;
 }
 template<>
 inline
 NRVec< complex<double> > &
 NRVec< complex<double> >::operator*=(const complex<double> &a)
 {
        copyonwrite();
        cblas_zscal(nn, &a, v, 1);
        return *this;
 }
 template<>
 inline
 const double NRVec<double>::operator*(const NRVec<double> &rhs) const
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("dot of incompatible vectors");
 #endif
        return cblas_ddot(nn, v, 1, rhs.v, 1);
 }
 template<>
 inline
 const complex<double>
 NRVec< complex<double> >::operator*(const NRVec< complex<double> > &rhs) const
 {
 #ifdef DEBUG
        if (nn != rhs.nn) laerror("dot of incompatible vectors");
 #endif
        complex<double> dot;
        cblas_zdotc_sub(nn, v, 1, rhs.v, 1, &dot);
        return dot;
 }
 // Sum of elements
 template<>
 inline
 const double NRVec<double>::asum() const
 {
        return cblas_dasum(nn, v, 1);
 }
 // Dot product: x * y
 template<>
 inline
 const double NRVec<double>::dot(const double *y, const int stride) const
 {
        return cblas_ddot(nn, y, stride, v, 1);
 }
 template<>
 inline
 const complex<double>
 NRVec< complex<double> >::dot(const complex<double> *y, const int stride) const
 {
        complex<double> dot;
        cblas_zdotc_sub(nn, y, stride, v, 1, &dot);
        return dot;
 }
 // return norm of the Vec
 template<>
 inline
 const double  NRVec<double>::norm() const
 {
 #ifdef CUDALA
 	if(location!=cpu) return cublasDnrm2(nn, v, 1);
 #endif
        return cblas_dnrm2(nn, v, 1);
 }
 template<>
 inline
 const double NRVec< complex<double> >::norm() const
 {
        return cblas_dznrm2(nn, v, 1);
 }
 // Max element of the array
 template<>
 inline
 const double  NRVec<double>::amax() const
 {
        return v[cblas_idamax(nn, v, 1)];
 }
 /*
 cblas_izamax seems to be missing at least in some cblas versions
 template<>
 inline
 const complex<double> NRVec< complex<double> >::amax() const
 {
        return v[cblas_izamax(nn, v, 1)];
 }
 */
 }//namespace
 #endif /* _LA_VEC_H_ */