*** empty log message ***

2013-11-04 14:56:39 +00:00
parent a9e30620f0
commit 80fe44fab2
18 changed files with 505 additions and 308 deletions
--- a/31
+++ b/31
@@ -1,7 +1,33 @@
 04.11.2013 added conjugateme() for corder=1 in complex diagonalize() and
 gdiagonalize() to get correct eigenvector convention for C-storage
 30.10.2013 macros for consistent lowercase and uppercase of character
 parameters for case sensitive BLAS and LAPACK; case-insesitivness of these in vec.cc and mat.cc (cublas still not treated)
 29.10.2013 included la_traits.h in nonclass.c to get correct extern "C" of cblas
 14.10.2013 added operator% to bivector
 01.06.2012 more efficient clear() by detachonly parameter to copyonwrite()
 14.03.2012 fixed overflow in product of dimensions of NRMat and NRSMat
 13.03.2012 symmetry of integrals with different spins added to fourindex.h
 23.02.2012 included unistd.h 
 23.02.2012 fixed max read/write size in multiget and multiput to 1GB
 24.01.2012 Improved DIIS (L.V.)
 19.01.2012 Fixed location for gpu in nonclass.cc
 02.06.2011 In oplus() =0 replaced by clear() to work on non-square matrices (J.P.)
 01.02.2011 Added trace2 for complex matrices by L. Veis
 18.01.2011 Minor bugfixes and compatibility with Intel C++ compiler by Roman Curik
 28.12.2010 Generalized diagonalization and functions of general complex matrices 
 21.12.2010 Changed to size_t in matrix put,get to prevent overflow
 08.12.2010 Deallocate method added to LA_traits, used for memory saving in matrix exp
 24.11.2010 Added checking for integer overflow in exptimes
 23.10.2010 Fixed dependency on atlas for static libraries in configure.ac 
 27.09.2010 Seed of CSRMat class added.
 22.09.2010 Allowed formal operator[] on gpu matrices
 22.09.2010 Added submatrix() to SparseSMat
 22.09.2010 Zgerc and zgeru implemented in non-cblas version
 21.09.2010 Fixed a bug in laerror macro causing sometimes compilation problems
 08.09.2010 RELEASE 0.6
-08.09.2010 Doxygen documentation by M. Sulc
+08.09.2010 Doxygen documentation for  NRVec, NRMat, NRSMat classes contributed by M. Sulc
 08.09.2010 Extended CUBLAS support for NRVec, NRMat, NRSMat classes contributed by M. Sulc
-08.09.2010 Minor bugfixes contributed by M. Sulc
+08.09.2010 Minor bugfixes and improvements contributed by M. Sulc
 25.06.2010 Added proof-of-concept CUBLAS support for NRVec, NRMat, NRSMat 
 24.06.2010 Fixed a memory leak existing when MATPTR  was defined
 18.06.2010 added autoconf support for BLAS+LAPACK compiled with 64-bit integers and for CUBLAS
@@ -60,3 +86,4 @@ xx.08.2008 fixed wrong permutation symmetry in previously unused (and untested)
 11.03.2008 added cblas_idamax replacement for non-cblas
 05.03.2008 fixed transposed bug in inverse() with non-cblas
 26.02.2008 INITIAL RELEASE 0.1
--- a/bitvector.cc
+++ b/bitvector.cc
@@ -130,7 +130,7 @@ return *this;
 /*number of ones in a binary number, from "Hacker's delight" book*/
 #ifdef LONG_IS_32 
-static unsigned long word_popul(unsigned long x) 
+static unsigned int word_popul(unsigned long x) 
 {
 x -= ((x>>1)&0x55555555);
 x = (x&0x33333333) + ((x>>2)&0x33333333);
@@ -140,9 +140,10 @@ x+= (x>>16);
 return x&0x3f;
 }
 #else
-static unsigned long word_popul(unsigned long x)
+//@@@@ use an efficient trick
 static unsigned int word_popul(unsigned long x)
 {
-unsigned long s=0;
+unsigned int s=0;
 for(int i=0; i<64; ++i)
        {
        if(x&1) ++s;
@@ -156,7 +157,6 @@ return s;
 unsigned int bitvector::population(const unsigned int before) const
 {
 //@@@before
 int i;
 unsigned int s=0;
 for(i=0; i<nn-1; ++i) s+=word_popul(v[i]);
@@ -170,4 +170,21 @@ if(modulo)
 return s+word_popul(a);
 }
 unsigned int bitvector::operator%(const bitvector &y) const
 {
 if(nn!=y.nn) laerror("incompatible size in bitdifference");
 unsigned int s=0;
 for(int i=0; i<nn-1; ++i) s+=word_popul(v[i]^y.v[i]);
 bitvector_block a=v[nn-1]^y.v[nn-1];
 if(modulo)
        {
        bitvector_block mask= ~((bitvector_block)0);
        mask <<=modulo;
        a &= ~mask;
        }
 return s+word_popul(a);
 }
 }//namespace
--- a/bitvector.h
+++ b/bitvector.h
@@ -55,7 +55,7 @@ public:
 	void reset(const unsigned int i) {v[i/blockbits] &= ~(1<<(i%blockbits));};
 	const bool get(const unsigned int i) {return (v[i/blockbits] >>(i%blockbits))&1;};
 	const bool assign(const unsigned int i, const bool r) {if(r) set(i); else reset(i); return r;};
-	void clear() {copyonwrite(); memset(v,0,nn*sizeof(bitvector_block));};
+	void clear() {copyonwrite(true); memset(v,0,nn*sizeof(bitvector_block));};
 	void fill() {memset(v,0xff,nn*sizeof(bitvector_block));};
 	bool operator!=(const bitvector &rhs) const;
 	bool operator==(const bitvector &rhs) const {return !(*this != rhs);};
@@ -70,13 +70,13 @@ public:
 	bitvector operator&(const bitvector &rhs) const {return bitvector(*this) &= rhs;};
 	bitvector operator|(const bitvector &rhs) const {return bitvector(*this) |= rhs;};
 	bitvector operator^(const bitvector &rhs) const {return bitvector(*this) ^= rhs;};
-	unsigned int population(const unsigned int before=0) const; //@@@number of 1's
+        unsigned int operator%(const bitvector &y) const; //number of differing bits
 	unsigned int population(const unsigned int before=0) const; //number of 1's
 	//extended, truncated const i.e. not on *this but return new entity, take care of modulo's bits
 	//logical shifts  <<= >>= << >> not implemented yet
 	//logical rotations not implemented yet
 	};
 extern std::ostream & operator<<(std::ostream &s, const bitvector &x);
 extern std::istream & operator>>(std::istream  &s, bitvector &x);
--- a/csrmat.cc
+++ b/csrmat.cc
@@ -52,8 +52,9 @@ INSTANTIZE(complex<double>)
 */
 //// forced instantization of functions in the header in the corresponding object file
-template class CSRMat<double>;
+//@@@@@template class CSRMat<double>;
-template class CSRMat<complex<double> >;
+//@@@@template class CSRMat<complex<double> >;
 //@@@@ unfinished class commented out
--- a/fortran.h
+++ b/fortran.h
@@ -11,3 +11,6 @@
 #undef FORINT
 #define FINT int
 #endif
 #define BLAS_FORTRANCASE(x) toupper(x)
 #define LAPACK_FORTRANCASE(x) toupper(x)
--- a/fourindex.h
+++ b/fourindex.h
@@ -104,10 +104,10 @@ __attribute__((packed))
 //later add symmetry of complex integrals
-typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5} fourindexsymtype; //only permutation-nonequivalent elements are stored
+typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5, twoelectronrealmullikanAB=6 } fourindexsymtype; //only permutation-nonequivalent elements are stored
 // these should actually be static private members of the fourindex class, but leads to an ICE on gcc3.2
-static const int fourindex_n_symmetrytypes=6;
+static const int fourindex_n_symmetrytypes=7;
-static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8};
+static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8,4};
 static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]=
 		{
 		{{0,1,2,3,1}},
@@ -116,6 +116,7 @@ static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]=
 		{{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1}},
 		{{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1},{2,3,0,1,1},{3,2,0,1,-1},{2,3,1,0,-1},{3,2,1,0,1}},
 		{{0,1,2,3,1}}, //T2IjAb_aces is like nosymmetry but different index ranges
 		{{0,1,2,3,1},{1,0,2,3,1},{0,1,3,2,1},{1,0,3,2,1}},
 		};
@@ -669,6 +670,108 @@ typedef typename LA_traits<T>::normtype normtype;
 //make it as a derived class in order to be able to use it in a base class context - "supermatrix" operations
 template<class T, class I> 
 class fourindex_dense<twoelectronrealmullikanAB,T,I> : public NRMat<T> {
 public:
 	fourindex_dense(): NRMat<T>() {};
 	explicit fourindex_dense(const int n): NRMat<T>(n*(n+1)/2,n*(n+1)/2) {};
 	fourindex_dense(const NRMat<T> &rhs): NRMat<T>(rhs) {}; //be able to convert the parent class  transparently to this
 	fourindex_dense(const T &a, const int n): NRMat<T>(a,n*(n+1)/2,n*(n+1)/2) {};
 	fourindex_dense(const T *a, const int n): NRMat<T>(a,n*(n+1)/2,n*(n+1)/2) {};
 	//and also construct it from sparse and externally stored fourindex classes
 	//it seems not possible to nest template<class I> just for the two constructors
 	fourindex_dense(const fourindex<I,T> &rhs);
 	fourindex_dense(const fourindex_ext<I,T> &rhs);
 	T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l);
 	const T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const;
 	void resize(const int n) {(*this).NRMat<T>::resize(n*(n+1)/2,n*(n+1)/2);};
 	void putext(int f, T thr=1e-15);
 	int nbas() const {return (int)std::sqrt(2*(*this).nrows());};
 };
 template<class T, class I>
 void fourindex_dense<twoelectronrealmullikanAB,T,I>::putext(int f, T thr)
 {
 T y;
 for(int i=1; i<=nbas(); ++i) for(int j=1; j<=i; ++j)
        for(int k=1; k<=nbas(); ++k) for(int l=1; l<=k; ++l)
 		if((y=abs((*this)(i,j,k,l))) > thr)
 			{
 			matel4stored<I,T> x;
 			x.elem= y;
 			x.index.indiv.i=i;
 			x.index.indiv.j=j;
 			x.index.indiv.k=k;
 			x.index.indiv.l=l;
 			if(sizeof(matel4stored<I,T>) != write(f,&x,sizeof(matel4stored<I,T>)) )
 				laerror("write error in putext");
 			}
 }
 template<class T, class I> 
 fourindex_dense<twoelectronrealmullikanAB,T,I>::fourindex_dense(const fourindex<I,T> &rhs) : NRMat<T>((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2)
 {
 if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch");
 typename fourindex<I,T>::iterator p;
 #ifdef DEBUG
 unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j);
 unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l);
 if (IJ<0 || IJ>=(unsigned int)NRMat<T>::nn || KL<0 || KL>=(unsigned int)NRMat<T>::mm) laerror("fourindex_dense index out of range in constructor");
 #endif
 for(p=rhs.begin(); p!= rhs.end(); ++p) (*this)(p->index.indiv.i,p->index.indiv.j,p->index.indiv.k,p->index.indiv.l) = p->elem;
 }
 template<class T, class I>
 fourindex_dense<twoelectronrealmullikanAB,T,I>::fourindex_dense(const fourindex_ext<I,T> &rhs) : NRMat<T>((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2)
 {
 if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch");
 typename fourindex_ext<I,T>::iterator p;
 for(p=rhs.begin(); p!= rhs.end(); ++p) 
 	{
 #ifdef DEBUG
 unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j);
 unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l);
 if (IJ<0 || IJ>=(unsigned int)NRMat<T>::nn || KL<0 || KL>=(unsigned int)NRMat<T>::mm) laerror("fourindex_dense index out of range in constructor");
 #endif
 	(*this)(p->index.indiv.i,p->index.indiv.j ,p->index.indiv.k,p->index.indiv.l) = p->elem;
 	}
 }
 template<class T, class DUMMY>
 T& fourindex_dense<twoelectronrealmullikanAB,T,DUMMY>::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l)
 {
 int I = SMat_index_1(i,j);
 int J = SMat_index_1(k,l);
 //I,J act as indices of a NRmat
 #ifdef DEBUG
     	if (*NRMat<T>::count != 1) laerror("lval (i,j,k,l) with count > 1 in fourindex_dense");
       	if (I<0 || I>=NRMat<T>::nn || J<0 || J>=NRMat<T>::mm) laerror("fourindex_dense index out of range");
       	if (!NRMat<T>::v) laerror("access to unallocated fourindex_dense");
 #endif
 return NRMat<T>::operator()(I,J);
 }
 template<class T, class DUMMY>
 const T& fourindex_dense<twoelectronrealmullikanAB,T,DUMMY>::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const
 {
 int I = SMat_index_1(i,j);
 int J = SMat_index_1(k,l);
 //I,J act as indices of a NRSmat
 #ifdef DEBUG
       	if (I<0 || I>=NRMat<T>::nn || J<0 || J>=NRMat<T>::mm) laerror("fourindex_dense index out of range");
       	if (!NRMat<T>::v) laerror("access to unallocated fourindex_dense");
 #endif
 return NRMat<T>::operator()(I,J);
 }
 ////////////////////
 template<class T, class I> 
 class fourindex_dense<twoelectronrealmullikan,T,I> : public NRSMat<T> {
 public:
 	fourindex_dense(): NRSMat<T>() {};
--- a/la_traits.h
+++ b/la_traits.h
@@ -35,6 +35,7 @@
 #include <fstream>
 #include <limits>
 #include <complex>
 #include <unistd.h>
 //using namespace std;
@@ -56,7 +57,7 @@ extern "C" {
 #include "noncblas.h"
 #else
 extern "C" {
-#include "clapack.h"
+#include "atlas/clapack.h"
 }
 #endif
@@ -213,7 +214,7 @@ typedef C normtype;
 typedef C realtype;
 typedef complex<C> complextype;
 static inline C sqrabs(const complex<C> x) { return x.real()*x.real()+x.imag()*x.imag();}
-static inline bool gencmp(const complex<C> *x, const complex<C> *y, int n) {return memcmp(x,y,n*sizeof(complex<C>));}
+static inline bool gencmp(const complex<C> *x, const complex<C> *y, size_t n) {return memcmp(x,y,n*sizeof(complex<C>));}
 static bool bigger(const  complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
 static bool smaller(const  complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
 static inline normtype norm (const  complex<C> &x) {return std::abs(x);}
@@ -225,9 +226,10 @@ static void multiget(size_t n,int fd, complex<C> *x, bool dimensions=0)
 	size_t total=0;
 	size_t system_limit = (1L<<30)/sizeof(complex<C>); //do not expect too much from the system and read at most 1GB at once
 	ssize_t r;
 	size_t nn;
 	do{
-		r=read(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>)); 
+		r=read(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>)); 
-		if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
+		if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
 		else total += r/sizeof(complex<C>);
 		if(r%sizeof(complex<C>)) laerror("read error 2");
 	  }
@@ -238,16 +240,17 @@ static void multiput(size_t n, int fd, const complex<C> *x, bool dimensions=0)
 	size_t total=0;
 	size_t system_limit = (1L<<30)/sizeof(complex<C>); //do not expect too much from the system and write at most 1GB at once
 	ssize_t r;
 	size_t nn;
 	do{
-		r=write(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>)); 
+		r=write(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(complex<C>)); 
-		if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "<<r<<"  perror "<<strerror(errno) <<std::endl; laerror("write error");}
+		if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "<<r<<"  perror "<<strerror(errno) <<std::endl; laerror("write error");}
 		else total += r/sizeof(complex<C>);
 		if(r%sizeof(complex<C>)) laerror("write error 2");
 	  }
 	while(total < n);
 	}
-static void copy(complex<C> *dest, complex<C> *src, unsigned int n) {memcpy(dest,src,n*sizeof(complex<C>));}
+static void copy(complex<C> *dest, complex<C> *src, size_t n) {memcpy(dest,src,n*sizeof(complex<C>));}
-static void clear(complex<C> *dest, unsigned int n) {memset(dest,0,n*sizeof(complex<C>));}
+static void clear(complex<C> *dest, size_t n) {memset(dest,0,n*sizeof(complex<C>));}
 static void copyonwrite(complex<C> &x) {};
 static void clearme(complex<C> &x) {x=0;};
 static void deallocate(complex<C> &x) {};
@@ -266,7 +269,7 @@ typedef C normtype;
 typedef C realtype;
 typedef complex<C> complextype;
 static inline C sqrabs(const C x) { return x*x;}
-static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));}
+static inline bool gencmp(const C *x, const C *y, size_t n) {return memcmp(x,y,n*sizeof(C));}
 static inline bool bigger(const  C &x, const C &y) {return x>y;}
 static inline bool smaller(const  C &x, const C &y) {return x<y;}
 static inline normtype norm (const  C &x) {return std::abs(x);}
@@ -278,9 +281,10 @@ static void multiget(size_t n,int fd, C *x, bool dimensions=0)
 	size_t total=0;
 	size_t system_limit = (1L<<30)/sizeof(C); //do not expect too much from the system and read at most 1GB at once
 	ssize_t r;
 	size_t nn;
 	do{
-		r=read(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(C)); 
+		r=read(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(C)); 
-		if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "<<r<<std::endl; laerror("read error");}
+		if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("read error");}
 		else total += r/sizeof(C);
 		if(r%sizeof(C)) laerror("read error 2");
 	  }
@@ -291,16 +295,17 @@ static void multiput(size_t n, int fd, const C *x, bool dimensions=0)
 	size_t total=0;
 	size_t system_limit = (1L<<30)/sizeof(C); //do not expect too much from the system and write at most 1GB at once
 	ssize_t r;
 	size_t nn;
 	do{
-		r=write(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(C)); 
+		r=write(fd,x+total,nn=(n-total > system_limit ? system_limit : n-total)*sizeof(C)); 
-		if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "<<r<<std::endl; laerror("write error");}
+		if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "<<r<<" perror "<<strerror(errno) <<std::endl; laerror("write error");}
 		else total += r/sizeof(C);
 		if(r%sizeof(C)) laerror("write error 2");
 	  }
 	while(total < n);
 	}
-static void copy(C *dest, C *src, unsigned int n) {memcpy(dest,src,n*sizeof(C));}
+static void copy(C *dest, C *src, size_t n) {memcpy(dest,src,n*sizeof(C));}
-static void clear(C *dest, unsigned int n) {memset(dest,0,n*sizeof(C));}
+static void clear(C *dest, size_t n) {memset(dest,0,n*sizeof(C));}
 static void copyonwrite(C &x) {};
 static void clearme(C &x) {x=0;};
 static void deallocate(C &x) {};
@@ -323,7 +328,7 @@ typedef X<C> producttype; \
 typedef typename LA_traits<C>::normtype normtype; \
 typedef X<typename LA_traits<C>::realtype> realtype; \
 typedef X<typename LA_traits<C>::complextype> complextype; \
-static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
+static bool gencmp(const C *x, const C *y, size_t n) {for(size_t i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
 static inline bool bigger(const  C &x, const C &y) {return x>y;} \
 static inline bool smaller(const  C &x, const C &y) {return x<y;} \
 static inline normtype norm (const X<C> &x) {return x.norm();} \
@@ -332,8 +337,8 @@ static void put(int fd, const X<C> &x, bool dimensions=1, bool transp=0) {x.put(
 static void get(int fd, X<C> &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions,transp);} \
 static void multiput(size_t n,int fd, const X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].put(fd,dimensions);} \
 static void multiget(size_t n,int fd, X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].get(fd,dimensions);} \
-static void copy(C *dest, C *src, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i]=src[i];} \
+static void copy(C *dest, C *src, size_t n) {for(size_t i=0; i<n; ++i) dest[i]=src[i];} \
-static void clear(C *dest, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i].clear();}\
+static void clear(C *dest, size_t n) {for(size_t i=0; i<n; ++i) dest[i].clear();}\
 static void copyonwrite(X<C> &x) {x.copyonwrite();}\
 static void clearme(X<C> &x) {x.clear();}\
 static void deallocate(X<C> &x) {x.dealloc();}\
@@ -359,7 +364,7 @@ typedef NRMat<C> producttype;  \
 typedef typename LA_traits<C>::normtype normtype;  \
 typedef X<typename LA_traits<C>::realtype> realtype; \
 typedef X<typename LA_traits<C>::complextype> complextype; \
-static bool gencmp(const C *x, const C *y, int n) {for(int i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
+static bool gencmp(const C *x, const C *y, size_t n) {for(size_t i=0; i<n; ++i) if(x[i]!=y[i]) return true; return false;} \
 static inline bool bigger(const  C &x, const C &y) {return x>y;} \
 static inline bool smaller(const  C &x, const C &y) {return x<y;} \
 static inline normtype norm (const X<C> &x) {return x.norm();}  \
@@ -368,8 +373,8 @@ static void put(int fd, const X<C> &x, bool dimensions=1, bool transp=0) {x.put(
 static void get(int fd, X<C> &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions);}  \
 static void multiput(size_t n,int fd, const X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].put(fd,dimensions);}  \
 static void multiget(size_t n,int fd, X<C> *x, bool dimensions=1) {for(size_t i=0; i<n; ++i) x[i].get(fd,dimensions);}  \
-static void copy(C *dest, C *src, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i]=src[i];}  \
+static void copy(C *dest, C *src, size_t n) {for(size_t i=0; i<n; ++i) dest[i]=src[i];}  \
-static void clear(C *dest, unsigned int n) {for(unsigned int i=0; i<n; ++i) dest[i].clear();} \
+static void clear(C *dest, size_t n) {for(size_t i=0; i<n; ++i) dest[i].clear();} \
 static void copyonwrite(X<C> &x) {x.copyonwrite();} \
 static void clearme(X<C> &x) {x.clear();} \
 static void deallocate(X<C> &x) {x.dealloc();} \
--- a/laerror.cc
+++ b/laerror.cc
@@ -87,7 +87,26 @@ extern "C" void ATL_xerbla(int p, char *rout, char *form, ...){
 	laerror(msg0);
 }
-int cblas_errprn(int ierr, int info, char *form, ...) {
+#ifndef NONCBLAS
 #include "cblas.h"
 #include <stdarg.h>
 extern "C" void cblas_xerbla(int p, const char *rout, const char *form, ...)
 {
   va_list argptr;
   va_start(argptr, form);
   if (p)
      {
      fprintf(stdout, "Parameter %d to routine %s was incorrect\n", p, rout);
      fprintf(stderr, "Parameter %d to routine %s was incorrect\n", p, rout);
      }
   vfprintf(stdout, form, argptr);
   vfprintf(stderr, form, argptr);
   va_end(argptr);
   laerror("terminating in cblas_xerbla");
 }
 extern "C" int cblas_errprn(int ierr, int info, char *form, ...) {
 	char msg0[1024], *msg;
 	va_list argptr;
 	va_start(argptr, form);
@@ -98,5 +117,6 @@ int cblas_errprn(int ierr, int info, char *form, ...) {
 	laerror(msg0);
 	return 0;
 }
 #endif
 }//namespace
--- a/mat.cc
+++ b/mat.cc
@@ -26,11 +26,8 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <errno.h>
 #include <unistd.h>
 extern "C" {
 	extern ssize_t read(int, void *, size_t);
 	extern ssize_t write(int, const void *, size_t);
 }
 namespace LA {
@@ -77,14 +74,14 @@ const NRMat<T> NRMat<T>::otimes(const NRMat<T> &rhs, bool reversecolumns) const
 		{
 			T c = (*this)(i,j);
 			for(k=0;k<rhs.nn;k++) for(l=0;l<rhs.mm;l++)
-				r( i*rhs.nn + k, l*mm + j ) = c*rhs(k,l);
+				r( i*(size_t)rhs.nn + k, l*mm + j ) = c*rhs(k,l);
 		}
 	}else{
 		for(i=0;i<nn;i++) for(j=0;j<mm;j++)
 		{
 			T c=(*this)(i,j);
 			for(k=0;k<rhs.nn;k++) for(l=0;l<rhs.mm;l++)
-				r( i*rhs.nn+k, j*rhs.mm+l ) = c *rhs(k,l);
+				r( i*(size_t)rhs.nn+k, j*(size_t)rhs.mm+l ) = c *rhs(k,l);
 		}
 	}
@@ -108,7 +105,7 @@ const NRVec<T> NRMat<T>::row(const int i, int l) const {
 #ifdef MATPTR
 	        v[i]
 #else
-		v + i*l
+		v + i*(size_t)l
 #endif
 		, l);
 	return r;
@@ -144,7 +141,7 @@ void NRMat<T>::put(int fd, bool dim, bool transp) const {
 				#ifdef MATPTR
 	                                v[i][j]
 				#else
-	                                v[i*mm+j]
+	                                v[i*(size_t)mm+j]
 				#endif
 					,dim ,transp);
 			}
@@ -196,7 +193,7 @@ void NRMat<T>::get(int fd, bool dim, bool transp){
 				#ifdef MATPTR
 					v[i][j]
 				#else
-					v[i*mm+j]
+					v[i*(size_t)mm+j]
 				#endif
 				,dim,transp);
 			}
@@ -476,13 +473,13 @@ NRMat<T> & NRMat<T>::operator-=(const T &a) {
 ******************************************************************************/
 template <>
 const NRMat<double> NRMat<double>::operator-() const {
-	const int nm = nn*mm;
+	const size_t nm = (size_t)nn*mm;
        NRMat<double> result(nn, mm, getlocation());
 #ifdef CUDALA
        if(location == cpu) {
 #endif
 #ifdef MATPTR
-	        for(register int i=0; i<nm; i++) result.v[0][i] = -v[0][i];
+	        for(register size_t i=0; i<nm; i++) result.v[0][i] = -v[0][i];
 #else
 		memcpy(result.v, v, nm*sizeof(double));
 		cblas_dscal(nm, -1., result.v, 1);
@@ -506,13 +503,13 @@ const NRMat<double> NRMat<double>::operator-() const {
 ******************************************************************************/
 template <>
 const NRMat<complex<double> > NRMat<complex<double> >::operator-() const {
-	const int nm = nn*mm;
+	const size_t nm = (size_t)nn*mm;
        NRMat<complex<double> > result(nn, mm, getlocation());
 #ifdef CUDALA
        if(location == cpu) {
 #endif
 #ifdef MATPTR
-	        for(register int i=0; i<nm; i++) result.v[0][i]= -v[0][i];
+	        for(register size_t i=0; i<nm; i++) result.v[0][i]= -v[0][i];
 #else
 		memcpy(result.v, v, nm*sizeof(complex<double>));
 		cblas_zscal(nm, &CMONE, result.v, 1);
@@ -539,9 +536,9 @@ const NRMat<T> NRMat<T>::operator-() const {
 	NRMat<T> result(nn, mm, getlocation());
 #ifdef MATPTR
-	for(register int i=0; i<nn*mm; i++) result.v[0][i] = -v[0][i];
+	for(register size_t i=0; i<(size_t)nn*mm; i++) result.v[0][i] = -v[0][i];
 #else
-	for(register int i=0; i<nn*mm; i++) result.v[i] = -v[i];
+	for(register size_t i=0; i<(size_t)nn*mm; i++) result.v[i] = -v[i];
 #endif
 	return result;
 }
@@ -562,11 +559,11 @@ const NRMat<T> NRMat<T>::operator&(const NRMat<T> &b) const {
 		if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem");
 		for(register int i=0; i<nn; i++){
-			cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(v + i*mm), 1, (float*)(result.v + i*(mm + b.mm)), 1);
+			cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(v + i*(size_t)mm), 1, (float*)(result.v + i*(size_t)(mm + b.mm)), 1);
 			TEST_CUBLAS("cublasScopy");
 		}
 		for(register int i=0; i<b.nn; i++){
-			cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(b.v + i*b.mm), 1, (float*)(result.v + (nn + i)*(mm + b.mm)), 1);
+			cublasScopy(mm*sizeof(T)/sizeof(float), (float*)(b.v + i*(size_t)b.mm), 1, (float*)(result.v + (nn + i)*(mm + b.mm)), 1);
 			TEST_CUBLAS("cublasScopy");
 		}
 	}
@@ -582,7 +579,7 @@ const NRMat<T> NRMat<T>::operator|(const NRMat<T> &b) const {
 		for (int j=0; j<mm; j++)
 			for (int k=0; k<b.nn; k++)
 				for (int l=0; l<b.mm; l++)
-					result[i*b.nn+k][j*b.mm+l] = (*this)[i][j]*b[k][l];
+					result[i*(size_t)b.nn+k][j*(size_t)b.mm+l] = (*this)[i][j]*b[k][l];
 	return result;
 }
@@ -689,7 +686,7 @@ const NRVec<double> NRMat<double>::rsum() const {
 #ifdef CUDALA
 	}else{
 		for(register int i=0;i<nn;i++){
-			cublasDaxpy(mm, 1.0, v + i*mm, 1, result.v, 1);
+			cublasDaxpy(mm, 1.0, v + i*(size_t)mm, 1, result.v, 1);
 			TEST_CUBLAS("cublasDaxpy");
 		}
 	}
@@ -714,7 +711,7 @@ const NRVec<complex<double> > NRMat<complex<double> >::rsum() const {
 #ifdef CUDALA
 	}else{
 		for(register int i=0;i<nn;i++){
-			cublasZaxpy(mm, CUONE, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(result.v), 1);
+			cublasZaxpy(mm, CUONE, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(result.v), 1);
 			TEST_CUBLAS("cublasZaxpy");
 		}
 	}
@@ -748,14 +745,14 @@ const NRMat<T> NRMat<T>::submatrix(const int fromrow, const int torow, const int
 			#ifdef MATPTR
 			memcpy(r.v[i - fromrow], v[i] + fromcol, m*sizeof(T));
 			#else
-			memcpy(r.v+(i - fromrow)*m, v + i*mm + fromcol, m*sizeof(T));
+			memcpy(r.v+(i - fromrow)*m, v + i*(size_t)mm + fromcol, m*sizeof(T));
 			#endif
 		}
 #ifdef CUDALA
 	}else{
 		if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
 		for(register int i=fromrow; i<=torow; ++i){
-			cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1);
+			cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*(size_t)mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1);
 			TEST_CUBLAS("cublasScopy");
 		}
 	}
@@ -786,13 +783,13 @@ void NRMat<T>::storesubmatrix(const int fromrow, const int fromcol, const NRMat
 		#ifdef MATPTR
 			memcpy(v[i] + fromcol, rhs.v[i - fromrow], m*sizeof(T));
 		#else
-			memcpy(v + i*mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T));
+			memcpy(v + i*(size_t)mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T));
 		#endif
 	#ifdef CUDALA
 		}else{
 			if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
-			cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*mm + fromcol), 1);
+			cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*(size_t)mm + fromcol), 1);
 		}
 	#endif
 	}
@@ -821,8 +818,8 @@ NRMat<T>& NRMat<T>::transposeme(const int _n) {
 				v[j][i] = tmp;
 			#else
 				register int a, b; 
-				a = i*mm + j;
+				a = i*(size_t)mm + j;
-				b = j*mm + i;
+				b = j*(size_t)mm + i;
 				T tmp = v[a];
 				v[a] = v[b];
 				v[b] = tmp;
@@ -847,7 +844,7 @@ NRMat<T>& NRMat<T>::transposeme(const int _n) {
 ******************************************************************************/
 template<>
 NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) {
-	const int nn_mm = nn*mm;
+	const size_t nn_mm = (size_t)nn*mm;
 #ifdef CUDALA
 	if(location == cpu){
 #endif
@@ -888,7 +885,7 @@ NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.
 ******************************************************************************/
 template<>
 NRMat<double>::NRMat(const NRMat<complex<double> > &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) {
-	const int nn_mm = nn*mm;
+	const size_t nn_mm = (size_t) nn*mm;
 #ifdef CUDALA
 	if(location == cpu){
 #endif
@@ -1079,7 +1076,7 @@ const NRSMat<double>  NRMat<double>::timestransposed() const {
 			#ifdef MATPTR
 				r(i, j) = cblas_ddot(mm, v[i], 1, v[j], 1);
 			#else
-				r(i, j) = cblas_ddot(mm, v + i*mm, 1, v + j*mm, 1);
+				r(i, j) = cblas_ddot(mm, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1);
 			#endif
 			}
 		}
@@ -1087,7 +1084,7 @@ const NRSMat<double>  NRMat<double>::timestransposed() const {
 	}else{
 		for(i=0; i<nn; ++i){
 			for(j=0; j<=i; ++j){
-				r(i, j) = cublasDdot(nn, v + i*mm, 1, v + j*mm, 1);
+				r(i, j) = cublasDdot(nn, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1);
 				TEST_CUBLAS("cublasDdot");
 			}
 		}
@@ -1113,7 +1110,7 @@ const NRSMat<complex<double> >  NRMat<complex<double> >::timestransposed() const
 			#ifdef MATPTR
 				cblas_zdotc_sub(nn, v[i], 1, v[j], 1, &r(i,j));
 			#else
-				cblas_zdotc_sub(nn, v + i*mm, 1, v + j*mm, 1, &r(i,j));
+				cblas_zdotc_sub(nn, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1, &r(i,j));
 			#endif
 			}
 		}
@@ -1121,7 +1118,7 @@ const NRSMat<complex<double> >  NRMat<complex<double> >::timestransposed() const
 	}else{
 		for(i=0; i<mm; ++i){
 			for(j=0; j<=i; ++j){
-				cuDoubleComplex val = cublasZdotc(nn, (const cuDoubleComplex *)(v + i*mm), 1, (const cuDoubleComplex *)(v + j*mm), 1);
+				cuDoubleComplex val = cublasZdotc(nn, (const cuDoubleComplex *)(v + i*(size_t)mm), 1, (const cuDoubleComplex *)(v + j*(size_t)mm), 1);
 				TEST_CUBLAS("cublasZdotc");
 				r(i, j) = *(reinterpret_cast<complex<double>*> (&val));
 			}
@@ -1172,7 +1169,7 @@ void NRMat<double>::randomize(const double &x) {
 	}else{
 		NRMat<double> tmp(nn, mm, cpu);
 		double *tmp_data = tmp;
-		for(register int i=0; i<nn*mm; ++i){
+		for(register size_t i=0; i<(size_t)nn*mm; ++i){
 			tmp_data[i] = x*(2.*random()/(1. + RAND_MAX) - 1.);
 		}
 		tmp.moveto(this->location);
@@ -1203,7 +1200,7 @@ void NRMat<complex<double> >::randomize(const double &x) {
 	}else{
 		NRMat<complex<double> > tmp(nn, mm, cpu);
 		complex<double> *tmp_data = tmp;
-		for(register int i=0; i<nn*mm; ++i){
+		for(register size_t i=0; i<(size_t)nn*mm; ++i){
 			const double re = x*(2.*random()/(1. + RAND_MAX) - 1.);
 			const double im = x*(2.*random()/(1. + RAND_MAX) - 1.);
 			tmp_data[i] = complex<double>(re, im);
@@ -1226,10 +1223,10 @@ NRMat<double>& NRMat<double>::operator*=(const double &a) {
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_dscal(nn*mm, a, *this, 1);
+		cblas_dscal((size_t)nn*mm, a, *this, 1);
 #ifdef CUDALA
 	}else{
-		cublasDscal(nn*mm, a, v, 1);
+		cublasDscal((size_t)nn*mm, a, v, 1);
 		TEST_CUBLAS("cublasDscal");
 	}
 #endif
@@ -1249,11 +1246,11 @@ NRMat<complex<double> >::operator*=(const complex<double> &a) {
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_zscal(nn*mm, &a, (*this)[0], 1);
+		cblas_zscal((size_t)nn*mm, &a, (*this)[0], 1);
 #ifdef CUDALA
 	}else{
 		const cuDoubleComplex fac = *(reinterpret_cast<const cuDoubleComplex*> (&a));
-		cublasZscal(nn*mm, fac, (cuDoubleComplex *)v, 1);
+		cublasZscal((size_t)nn*mm, fac, (cuDoubleComplex *)v, 1);
 		TEST_CUBLAS("cublasZscal");
 	}
 #endif
@@ -1271,9 +1268,9 @@ NRMat<T> & NRMat<T>::operator*=(const T &a) {
 	NOT_GPU(*this);
        copyonwrite();
 #ifdef MATPTR
-         for(register int i=0; i< nn*mm; i++) v[0][i] *= a;
+         for(register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= a;
 #else
-         for(register int i=0; i< nn*mm; i++) v[i] *= a;
+         for(register size_t i=0; i< (size_t)nn*mm; i++) v[i] *= a;
 #endif
        return *this;
 }
@@ -1294,10 +1291,10 @@ NRMat<double> & NRMat<double>::operator+=(const NRMat<double>  &rhs) {
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1);
+		cblas_daxpy((size_t)nn*mm, 1.0, rhs, 1, *this, 1);
 #ifdef CUDALA
 	}else{
-		cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1);
+		cublasDaxpy((size_t)nn*mm, 1.0, rhs, 1, v, 1);
 		TEST_CUBLAS("cublasDaxpy");
 	}
 #endif
@@ -1320,10 +1317,10 @@ NRMat<complex<double> >::operator+=(const NRMat< complex<double> >  &rhs) {
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_zaxpy(nn*mm, &CONE, rhs[0], 1, (*this)[0], 1);
+		cblas_zaxpy((size_t)nn*mm, &CONE, rhs[0], 1, (*this)[0], 1);
 #ifdef CUDALA
 	}else{
-		cublasZaxpy(nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
+		cublasZaxpy((size_t)nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
 	}
 #endif
 	return *this;
@@ -1345,9 +1342,9 @@ NRMat<T> & NRMat<T>::operator+=(const NRMat<T>  &rhs) {
 	copyonwrite();
 	#ifdef MATPTR
-		for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i];
+		for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i];
 	#else
-		for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i];
+		for(size_t i=0; i< (size_t)nn*mm; i++) v[i] += rhs.v[i];
 	#endif
        return *this;
 }
@@ -1368,10 +1365,10 @@ NRMat<double> & NRMat<double>::operator-=(const NRMat<double>  &rhs) {
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1);
+		cblas_daxpy((size_t)nn*mm, -1.0, rhs, 1, *this, 1);
 #ifdef CUDALA
 	}else{
-		cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1);
+		cublasDaxpy((size_t)nn*mm, -1.0, rhs, 1, v, 1);
 	}
 #endif
 	return *this;
@@ -1395,10 +1392,10 @@ NRMat< complex<double> >::operator-=(const NRMat< complex<double> >  &rhs) {
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_zaxpy(nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1);
+		cblas_zaxpy((size_t)nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1);
 #ifdef CUDALA
 	}else{
-		cublasZaxpy(nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
+		cublasZaxpy((size_t)nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1);
 	}
 #endif
 	return *this;
@@ -1421,9 +1418,9 @@ NRMat<T> & NRMat<T>::operator-=(const NRMat<T>  &rhs) {
 	copyonwrite();
 	#ifdef MATPTR
-		for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i];
+		for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i];
 	#else
-		for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i];
+		for(size_t i=0; i<(size_t) nn*mm; i++) v[i] += rhs.v[i];
 	#endif
        return *this;
 }
@@ -1693,10 +1690,10 @@ const double NRMat<double>::dot(const NRMat<double> &rhs) const {
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		ret = cblas_ddot(nn*mm, (*this)[0], 1, rhs[0], 1);
+		ret = cblas_ddot((size_t)nn*mm, (*this)[0], 1, rhs[0], 1);
 #ifdef CUDALA
 	}else{
-		ret = cublasDdot(nn*mm, v, 1, rhs.v, 1);
+		ret = cublasDdot((size_t)nn*mm, v, 1, rhs.v, 1);
 	}
 #endif
 	return ret;
@@ -1721,10 +1718,10 @@ NRMat<complex<double> >::dot(const NRMat<complex<double> > &rhs) const {
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_zdotc_sub(nn*mm, (*this)[0], 1, rhs[0], 1, &ret);
+		cblas_zdotc_sub((size_t)nn*mm, (*this)[0], 1, rhs[0], 1, &ret);
 #ifdef CUDALA
 	}else{
-		cuDoubleComplex val = cublasZdotc(nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
+		cuDoubleComplex val = cublasZdotc((size_t)nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1);
 		ret = *(reinterpret_cast<complex<double>*> (&val));
 	}
 #endif
@@ -1804,7 +1801,7 @@ void NRMat<double>::diagmultl(const NRVec<double> &rhs) {
 		for(register int i=0; i<nn; i++){ cblas_dscal(mm, rhs[i], (*this)[i], 1); }
 #ifdef CUDALA
 	}else{
-		for(register int i=0; i<nn; i++){ cublasDscal(mm, rhs[i], v + i*mm, 1); }
+		for(register int i=0; i<nn; i++){ cublasDscal(mm, rhs[i], v + i*(size_t)mm, 1); }
 	}
 #endif
 }
@@ -1830,7 +1827,7 @@ void NRMat< complex<double> >::diagmultl(const NRVec< complex<double> > &rhs) {
 	}else{
 		for(register int i=0; i<nn; i++){
 			const cuDoubleComplex alpha = make_cuDoubleComplex(rhs[i].real(), rhs[i].imag());
-			cublasZscal(mm, alpha, (cuDoubleComplex*)(v + i*mm), 1);
+			cublasZscal(mm, alpha, (cuDoubleComplex*)(v + i*(size_t)mm), 1);
 		}
 	}
 #endif
@@ -1913,7 +1910,7 @@ NRMat<double>::operator*(const NRSMat<double> &rhs) const {
 #ifdef CUDALA
 	}else{
 		for(register int i=0; i<nn; i++){
-			cublasDspmv('U', mm, 1.0, rhs.v, v + i*mm, 1, 0.0, result.v + i*rhs_ncols, 1);
+			cublasDspmv('U', mm, 1.0, rhs.v, v + i*(size_t)mm, 1, 0.0, result.v + i*(size_t)rhs_ncols, 1);
 		}
 	}
 #endif
@@ -1947,7 +1944,7 @@ NRMat< complex<double> >::operator*(const NRSMat< complex<double> > &rhs) const
 #ifdef CUDALA
 	}else{
 		for(register int i=0; i<nn; i++){
-			cublasZhpmv('U', mm, CUONE, (cuDoubleComplex*)rhs.v, (cuDoubleComplex*)(v + i*mm), 1, CUZERO, (cuDoubleComplex*)(result.v + i*rhs_ncols), 1);
+			cublasZhpmv('U', mm, CUONE, (cuDoubleComplex*)rhs.v, (cuDoubleComplex*)(v + i*(size_t)mm), 1, CUZERO, (cuDoubleComplex*)(result.v + i*(size_t)rhs_ncols), 1);
 		}
 	}
 #endif
@@ -1974,10 +1971,10 @@ NRMat<complex<double> >& NRMat<complex<double> >::conjugateme() {
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_dscal(mm*nn, -1.0, (double *)((*this)[0]) + 1, 2);
+		cblas_dscal((size_t)mm*nn, -1.0, (double *)((*this)[0]) + 1, 2);
 #ifdef CUDALA
 	}else{
-		cublasDscal(mm*nn, -1.0, (double *)(this->v) + 1, 2);
+		cublasDscal((size_t)mm*nn, -1.0, (double *)(this->v) + 1, 2);
 	}
 #endif
 	return *this;
@@ -2048,12 +2045,12 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
 		const char transa, const NRMat<double> &b, const char transb, 
 		const double &alpha) {
-	int k(transa=='n'?a.mm:a.nn);
+	int k(tolower(transa)=='n'?a.mm:a.nn);
 #ifdef DEBUG
-	int l(transa=='n'?a.nn:a.mm);
+	int l(tolower(transa)=='n'?a.nn:a.mm);
-	int kk(transb=='n'?b.nn:b.mm);
+	int kk(tolower(transb)=='n'?b.nn:b.mm);
-	int ll(transb=='n'?b.mm:b.nn);
+	int ll(tolower(transb)=='n'?b.mm:b.nn);
 	if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat<double>::gemm(...)");
 	if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm");
 #endif
@@ -2066,8 +2063,8 @@ void NRMat<double>::gemm(const double &beta, const NRMat<double> &a,
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans),
+		cblas_dgemm(CblasRowMajor, (tolower(transa)=='n' ? CblasNoTrans : CblasTrans),
-				(transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
+				(tolower(transb)=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a,
 				a.mm, b , b.mm, beta, *this , mm);
 #ifdef CUDALA
 	}else{
@@ -2083,20 +2080,20 @@ void NRMat<complex<double> >::gemm(const complex<double> & beta,
 		const NRMat<complex<double> > & b, const char transb, 
 		const complex<double> & alpha)
 {
-	int k(transa=='n'?a.mm:a.nn);
+	int k(tolower(transa)=='n'?a.mm:a.nn);
 #ifdef DEBUG
-	int l(transa=='n'?a.nn:a.mm);
+	int l(tolower(transa)=='n'?a.nn:a.mm);
-	int kk(transb=='n'?b.nn:b.mm);
+	int kk(tolower(transb)=='n'?b.nn:b.mm);
-	int ll(transb=='n'?b.mm:b.nn);
+	int ll(tolower(transb)=='n'?b.mm:b.nn);
 	if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat<complex<double> >::gemm(...)");
 #endif
 	if (alpha==CZERO && beta==CONE) return;
 	copyonwrite();
 	cblas_zgemm(CblasRowMajor,
-			(transa=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)), 
+			(tolower(transa)=='n' ? CblasNoTrans : (tolower(transa)=='c'?CblasConjTrans:CblasTrans)), 
-			(transb=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)),
+			(tolower(transb)=='n' ? CblasNoTrans : (tolower(transb)=='c'?CblasConjTrans:CblasTrans)),
 			nn, mm, k, &alpha, a , a.mm, b , b.mm, &beta, *this , mm);
 }
@@ -2113,10 +2110,10 @@ const double  NRMat<double>::norm(const double scalar) const {
 #ifdef CUDALA
 		if(location == cpu){
 #endif
-			return cblas_dnrm2(nn*mm, (*this)[0], 1);
+			return cblas_dnrm2((size_t)nn*mm, (*this)[0], 1);
 #ifdef CUDALA
 		}else{
-			return cublasDnrm2(nn*mm, v, 1);
+			return cublasDnrm2((size_t)nn*mm, v, 1);
 		}
 #endif
 	}
@@ -2130,7 +2127,7 @@ const double  NRMat<double>::norm(const double scalar) const {
 #ifdef MATPTR
 			tmp = v[i][j];
 #else
-			tmp = v[i*mm+j];
+			tmp = v[i*(size_t)mm+j];
 #endif
 			if(i == j) tmp -= scalar;
 			sum += tmp*tmp;
@@ -2152,10 +2149,10 @@ const double NRMat<complex<double> >::norm(const complex<double> scalar) const {
 #ifdef CUDALA
 		if(location == cpu){
 #endif
-			return cblas_dznrm2(nn*mm, (*this)[0], 1);
+			return cblas_dznrm2((size_t)nn*mm, (*this)[0], 1);
 #ifdef CUDALA
 		}else{
-			return cublasDznrm2(nn*mm, (cuDoubleComplex*)v, 1);
+			return cublasDznrm2((size_t)nn*mm, (cuDoubleComplex*)v, 1);
 		}
 #endif
 	}
@@ -2168,7 +2165,7 @@ const double NRMat<complex<double> >::norm(const complex<double> scalar) const {
 #ifdef MATPTR
 			tmp = v[i][j];
 #else
-			tmp = v[i*mm+j];
+			tmp = v[i*(size_t)mm+j];
 #endif
 			if(i == j) tmp -= scalar;
 			const double re = tmp.real();
@@ -2195,10 +2192,10 @@ void NRMat<double>::axpy(const double alpha, const NRMat<double> &mat) {
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_daxpy(nn*mm, alpha, mat, 1, *this, 1);
+		cblas_daxpy((size_t)nn*mm, alpha, mat, 1, *this, 1);
 #ifdef CUDALA
 	}else{
-		cublasDaxpy(nn*mm, alpha, mat, 1, *this, 1);
+		cublasDaxpy((size_t)nn*mm, alpha, mat, 1, *this, 1);
 	}
 #endif
 }
@@ -2221,7 +2218,7 @@ void NRMat<complex<double> >::axpy(const complex<double> alpha,
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_zaxpy(nn*mm, &alpha, mat, 1, (*this)[0], 1);
+		cblas_zaxpy((size_t)nn*mm, &alpha, mat, 1, (*this)[0], 1);
 #ifdef CUDALA
 	}else{
 		const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
@@ -2245,7 +2242,7 @@ const T NRMat<T>::trace() const {
 #ifdef MATPTR
 	for(register int i=0; i<nn; ++i) sum += v[i][i];
 #else
-	for(register int i=0; i<nn*nn; i += (nn+1)) sum += v[i];
+	for(register size_t i=0; i<(size_t)nn*nn; i += (nn+1)) sum += v[i];
 #endif
 	return sum;
 }
@@ -2554,7 +2551,7 @@ NRMat<double>& NRMat<double>::swap_rows(){
 #ifdef CUDALA
 	}else{
 	        for(register int i=0; i<n_pul; i++){
-		        cublasDswap(mm, v + i*mm, 1, v + (nn - i - 1)*mm, 1);
+		        cublasDswap(mm, v + i*(size_t)mm, 1, v + (nn - i - 1)*mm, 1);
 			TEST_CUBLAS("cublasDswap");
 	        }
 	}
@@ -2580,7 +2577,7 @@ NRMat<complex<double> >& NRMat<complex<double> >::swap_rows(){
 #ifdef CUDALA
 	}else{
 		for(register int i=0; i<n_pul; i++){
-		        cublasZswap(mm, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm), 1);
+		        cublasZswap(mm, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm), 1);
 			TEST_CUBLAS("cublasZswap");
 		}
 	}
@@ -2613,7 +2610,7 @@ NRMat<T>& NRMat<T>::swap_rows(){
 	}else{
 		if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat<T>::swap_rows");
 	        for(register int i=0; i<n_pul; i++){
-		        cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*mm), 1, (float *)(v + (nn - i - 1)*mm), 1);
+		        cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*(size_t)mm), 1, (float *)(v + (nn - i - 1)*mm), 1);
 			TEST_CUBLAS("cublasSswap");
 	        }
 	}
@@ -2745,7 +2742,7 @@ NRMat<double>& NRMat<double>::swap_rows_cols(){
 #ifdef CUDALA
 	}else{
 		for(register int i=0; i<n_pul; i++){
-			cublasDswap(mm, v + i*mm, 1, v + (nn - i - 1)*mm + mm - 1, -1);
+			cublasDswap(mm, v + i*(size_t)mm, 1, v + (nn - i - 1)*mm + mm - 1, -1);
 			TEST_CUBLAS("cublasDswap");
 		}
@@ -2792,7 +2789,7 @@ NRMat<complex<double> >& NRMat<complex<double> >::swap_rows_cols(){
 #ifdef CUDALA
 	}else{
 		for(register int i=0;i<n_pul;i++){
-			cublasZswap(mm, (cuDoubleComplex*)(v + i*mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm + mm - 1), -1);
+			cublasZswap(mm, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(v + (nn - i - 1)*mm + mm - 1), -1);
 			TEST_CUBLAS("cublasZswap");
 		}
 		if(nn & 1){
@@ -2817,7 +2814,7 @@ template<typename T>
 NRMat<T>& NRMat<T>::swap_rows_cols(){
 	const int n_pul = nn >> 1;
 	const int m_pul = mm >> 1;
-	const int dim = nn*mm;
+	const size_t dim = (size_t)nn*mm;
 	T *data_ptr;
 	T tmp;
@@ -2837,7 +2834,7 @@ NRMat<T>& NRMat<T>::swap_rows_cols(){
 	}else{
 		if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat<T>::swap_rows_cols");
 	        for(register int i=0; i<n_pul; i++){
-		        cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*mm), 1, (float *)(v + (nn - i - 1)*mm) - 1, -1);
+		        cublasSswap(mm*sizeof(T)/sizeof(float), (float *)(v + i*(size_t)mm), 1, (float *)(v + (nn - i - 1)*mm) - 1, -1);
 			TEST_CUBLAS("cublasSswap");
 	        }
--- a/mat.h
+++ b/mat.h
@@ -39,10 +39,10 @@ protected:
 	T *v;//!< pointer to the data stored continuously in emmory
 #endif
 	int *count;//!< reference counter
 public:
 #ifdef CUDALA
 	GPUID location;
 #endif
 public:
 	friend class NRVec<T>;
 	friend class NRSMat<T>;
@@ -89,16 +89,16 @@ public:
 	//! explicit constructor converting vector into a <code>NRMat<T></code> object
 #ifdef MATPTR
 	explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0):NRMat(&rhs[0][0] + offset , n, m){
-		if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
+		if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
 	};
 #else
 	explicit NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset = 0);
 #endif
 #ifdef MATPTR
-	const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v[0],rhs.v[0],nn*mm);} //memcmp for scalars else elementwise
+	const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v[0],rhs.v[0],(size_t)nn*mm);} //memcmp for scalars else elementwise
 #else
-        const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v,rhs.v,nn*mm);} //memcmp for scalars else elementwise
+        const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits<T>::gencmp(v,rhs.v,(size_t)nn*mm);} //memcmp for scalars else elementwise
 #endif
 	const bool operator==(const NRMat &rhs) const {return !(*this != rhs);};
@@ -107,7 +107,7 @@ public:
 	inline int getcount() const {return count?*count:0;}
 	//! ensure that the data of this matrix are referenced exactly once
-	void copyonwrite();
+	void copyonwrite(bool detachonly=false);
 	/***************************************************************************//**
 	 * routines for CUDA related stuff
@@ -260,7 +260,7 @@ public:
 	//! get the number of columns
 	inline int ncols() const;
 	//! get the number of matrix elements
-	inline int size() const;
+	inline size_t size() const;
 	//! unformatted input
        void get(int fd, bool dimensions = 1, bool transposed = false);
@@ -274,8 +274,8 @@ public:
 	//! set all matrix elements equal to zero
        void clear(){
 		if(nn&&mm){
-			copyonwrite();
+			copyonwrite(true);
-			LA_traits<T>::clear((*this)[0], nn*mm);
+			LA_traits<T>::clear((*this)[0], (size_t)nn*mm);
 		}
 	};
@@ -379,7 +379,7 @@ template <typename T>
 NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
 	T* p;
 	*count = 1;
-	const int nm = n*m;
+	const size_t nm = (size_t)n*m;
 #ifdef CUDALA
 	location = (loc==undefined?DEFAULT_LOC:loc);
 	if(location == cpu) {
@@ -408,7 +408,7 @@ NRMat<T>::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count
 ******************************************************************************/
 template <typename T>
 NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) {
-	const int nm = n*m;
+	const size_t nm = (size_t)n*m;
 	T *p;
 	*count = 1;
@@ -447,7 +447,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n),
 ******************************************************************************/
 template <typename T>
 NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) {
-	const int nm = n*m;
+	const size_t nm = (size_t)n*m;
 	T *p;
 	*count = 1;
@@ -460,7 +460,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
 		p = v[0] = new T[nm];
 		for (register int i=1; i<n; i++) v[i] = v[i-1] + m;
 	#else
-		p = v = new T[m*n];
+		p = v = new T[nm];
 	#endif
 	if (a != (T)0)
 		for (register int i=0; i<nm; i++) *p++ = a;
@@ -483,7 +483,7 @@ NRMat<T>::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new
 ******************************************************************************/
 template <typename T>
 NRMat<T>::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) {
-	const int nm = n*m;
+	const size_t nm = (size_t)n*m;
 #ifdef CUDALA
 	location = DEFAULT_LOC;
 #endif
@@ -546,10 +546,10 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
 	*count = 1;
 #ifdef MATPTR
 	v = new T*[nn];
-	v[0] = new T[mm*nn];
+	v[0] = new T[(size_t)mm*nn];
 	for (int i=1; i<nn; i++) v[i] = v[i-1] + mm;
 #else
-	v = new T[mm*nn];
+	v = new T[(size_t)mm*nn];
 #endif
 #ifdef MATPTR
@@ -561,7 +561,7 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
 #else
 	for (i=0; i<nn; i++){
 		for (j=0; j<=i; j++){
-			v[i*nn + j] = v[j*nn + i] = rhs[k++];
+			v[i*(size_t)nn + j] = v[j*(size_t)nn + i] = rhs[k++];
 		}
 	}
 #endif
@@ -578,7 +578,7 @@ NRMat<T>::NRMat(const NRSMat<T> &rhs) {
 template <typename T>
 NRMat<T>::NRMat(const NRVec<T> &rhs, const int n, const int m, const int offset)
 {
-	if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
+	if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length");
 #ifdef CUDALA
 	location=rhs.location;
@@ -628,7 +628,7 @@ inline T* NRMat<T>::operator[](const int i) {
 	#ifdef MATPTR
 		return v[i];
 	#else
-		return v + i*mm;
+		return v + i*(size_t)mm;
 	#endif
 }
@@ -646,7 +646,7 @@ inline const T* NRMat<T>::operator[](const int i) const {
 	#ifdef MATPTR
 		return v[i];
 	#else
-		return v + i*mm;
+		return v + i*(size_t)mm;
 	#endif
 }
@@ -668,7 +668,7 @@ inline T& NRMat<T>::operator()(const int i, const int j){
 	#ifdef MATPTR
 		return v[i][j];
 	#else
-		return v[i*mm + j];
+		return v[i*(size_t)mm + j];
 	#endif
 }
@@ -689,7 +689,7 @@ inline const T& NRMat<T>::operator()(const int i, const int j) const{
 	#ifdef MATPTR
 		return v[i][j];
 	#else
-		return v[i*mm + j];
+		return v[i*(size_t)mm + j];
 	#endif
 }
@@ -712,11 +712,11 @@ inline const T NRMat<T>::get_ij(const int i, const int j) const{
 	#ifdef MATPTR
 		return v[i][j];
 	#else
-		return v[i*mm + j];
+		return v[i*(size_t)mm + j];
 	#endif
 #ifdef CUDALA
 	}else{
-		const int pozice = i*mm + j;
+		const size_t pozice = i*(size_t)mm + j;
 		gpuget(1, sizeof(T), v + pozice, &ret);
 		return ret;
 	}
@@ -743,8 +743,8 @@ inline int NRMat<T>::ncols() const{
 * @return number of elements
 ******************************************************************************/
 template <typename T>
-inline int NRMat<T>::size() const{
+inline size_t NRMat<T>::size() const{
-	return nn*mm;
+	return (size_t)nn*mm;
 }
 /***************************************************************************//**
@@ -795,7 +795,7 @@ inline const double NRMat<double>::amax() const{
 #ifdef CUDALA
 	}else{
 		double ret(0.0);
-		const int pozice = cublasIdamax(nn*mm, v, 1) - 1;
+		const size_t pozice = cublasIdamax((size_t)nn*mm, v, 1) - 1;
 		TEST_CUBLAS("cublasIdamax");
 		gpuget(1, sizeof(double), v + pozice, &ret);
 		return ret;
@@ -815,7 +815,7 @@ inline const double NRMat<double>::amin() const{
 	if(location == cpu){
 #endif
 		// idamin seems not to be supported
-		const int nm = nn*mm;
+		const size_t nm = (size_t)nn*mm;
 		double val(0.0);
 		int index(-1);
 		ret = std::numeric_limits<double>::max();
@@ -834,7 +834,7 @@ inline const double NRMat<double>::amin() const{
 		#endif	
 #ifdef CUDALA
 	}else{
-		const int pozice = cublasIdamin(nn*mm, v, 1) - 1;
+		const size_t pozice = cublasIdamin((size_t)nn*mm, v, 1) - 1;
 		TEST_CUBLAS("cublasIdamin");
 		gpuget(1, sizeof(double), v + pozice, &ret);
 	}
@@ -860,7 +860,7 @@ inline const complex<double> NRMat<complex<double> >::amax() const{
 #ifdef CUDALA
 	}else{
 		complex<double> ret(0.0, 0.0);
-		const int pozice = cublasIzamax(nn*mm, (cuDoubleComplex*)v, 1) - 1;
+		const size_t pozice = cublasIzamax((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1;
 		TEST_CUBLAS("cublasIzamax");
 		gpuget(1, sizeof(complex<double>), v + pozice, &ret);
 		return ret;
@@ -881,7 +881,7 @@ inline const complex<double> NRMat<complex<double> >::amin() const{
 	if(location == cpu){
 #endif
 		// idamin seems not to be supported
-		const int nm = nn*mm;
+		const size_t nm = (size_t)nn*mm;
 		int index(-1);
 		double val(0.0), min_val(0.0);
 		complex<double> z_val(0.0, 0.0);
@@ -903,7 +903,7 @@ inline const complex<double> NRMat<complex<double> >::amin() const{
 		#endif	
 #ifdef CUDALA
 	}else{
-		const int pozice = cublasIzamin(nn*mm, (cuDoubleComplex*)v, 1) - 1;
+		const size_t pozice = cublasIzamin((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1;
 		TEST_CUBLAS("cublasIzamin");
 		gpuget(1, sizeof(complex<double>), v + pozice, &ret);
 	}
@@ -991,7 +991,7 @@ NRMat<T> & NRMat<T>::operator|=(const NRMat<T> &rhs) {
 * @see NRMat<T>::count, NRMat<T>::operator|=() 
 ******************************************************************************/
 template <typename T>
-void NRMat<T>::copyonwrite() {
+void NRMat<T>::copyonwrite(bool detachonly) {
 	if(!count) laerror("attempt to call copyonwrite() for a matrix with count == 0");
 	if(*count > 1){
 		(*count)--;
@@ -1002,20 +1002,20 @@ void NRMat<T>::copyonwrite() {
 #endif
 		#ifdef MATPTR
 			T **newv = new T*[nn];
-			newv[0] = new T[mm*nn];
+			newv[0] = new T[(size_t)mm*nn];
-			memcpy(newv[0], v[0], mm*nn*sizeof(T));
+			if(!detachonly) memcpy(newv[0], v[0], (size_t)mm*nn*sizeof(T));
 			v = newv;
 			for(register int i=1; i<nn; i++) v[i] = v[i-1] + mm;
 		#else
-			T *newv = new T[mm*nn];
+			T *newv = new T[(size_t)mm*nn];
-			memcpy(newv, v, mm*nn*sizeof(T));
+			if(!detachonly) memcpy(newv, v, (size_t)mm*nn*sizeof(T));
 			v = newv;
 		#endif
 #ifdef CUDALA
 		}else{ //matrix is in GPU memory
-			T *newv = (T *) gpualloc(mm*nn*sizeof(T));
+			T *newv = (T *) gpualloc((size_t)mm*nn*sizeof(T));
 			if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem");
-			cublasScopy(nn*mm*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
+			if(!detachonly) cublasScopy(nn*mm*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
 			TEST_CUBLAS("cublasScopy");
 			v = newv;
 		}
@@ -1082,14 +1082,14 @@ void NRMat<T>::resize(int n, int m) {
 #endif
 		#ifdef MATPTR
 			v = new T*[nn];
-			v[0] = new T[m*n];
+			v[0] = new T[(size_t)m*n];
 			for (register int i=1; i< n; i++) v[i] = v[i-1] + m;
 		#else
-			v = new T[m*n];
+			v = new T[(size_t)m*n];
 		#endif
 #ifdef CUDALA
 		}else{
-			v = (T *) gpualloc(n*m*sizeof(T));
+			v = (T *) gpualloc((size_t)n*m*sizeof(T));
 		}
 #endif
                return;
@@ -1108,15 +1108,15 @@ void NRMat<T>::resize(int n, int m) {
 			delete[] v;
 		#ifdef MATPTR
 			v = new T*[nn];
-			v[0] = new T[m*n];
+			v[0] = new T[(size_t)m*n];
 			for (int i=1; i< n; i++) v[i] = v[i-1] + m;
 		#else
-			v = new T[m*n];
+			v = new T[(size_t)m*n];
 		#endif
 #ifdef CUDALA
 		}else{
 			gpufree(v);
-			v=(T *) gpualloc(n*m*sizeof(T));
+			v=(T *) gpualloc((size_t)n*m*sizeof(T));
 		}
 #endif
 	}
@@ -1228,7 +1228,7 @@ public:
 	#ifdef MATPTR
 		return NRMat<T>::v[i - 1][j - 1];
 	#else
-		return NRMat<T>::v[(i-1)*NRMat<T>::mm+j-1];
+		return NRMat<T>::v[(i-1)*(size_t)NRMat<T>::mm+j-1];
 	#endif
 	}
@@ -1258,11 +1258,11 @@ public:
 	#ifdef MATPTR
 			return NRMat<T>::v[i - 1][j - 1];
 	#else
-			return NRMat<T>::v[(i-1)*NRMat<T>::mm + (j-1)];
+			return NRMat<T>::v[(size_t)(i-1)*NRMat<T>::mm + (j-1)];
 	#endif
 	#ifdef CUDALA
 		}else{
-			const int pozice = (i-1)*NRMat<T>::mm + (j-1);
+			const size_t pozice = (size_t)(i-1)*NRMat<T>::mm + (j-1);
 			gpuget(1, sizeof(T), NRMat<T>::v + pozice, &ret);
 			return ret;
 		}
@@ -1286,10 +1286,10 @@ NRMat<T>& NRMat<T>::operator^=(const NRMat<T> &rhs){
 	copyonwrite();// ensure that *count == 1
 #ifdef MATPTR
-	for (register int i=0; i< nn*mm; i++) v[0][i] *= rhs.v[0][i];
+	for (register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= rhs.v[0][i];
 #else
-	const int Dim = nn*mm;
+	const size_t Dim = (size_t)nn*mm;
-	for(register int i=0; i<Dim; i++) v[i] *= rhs.v[i];
+	for(register size_t i=0; i<Dim; i++) v[i] *= rhs.v[i];
 #endif
 	return *this;
 }
@@ -1320,14 +1320,14 @@ void NRMat<T>::moveto(const GPUID dest) {
 	T *vold = v;
 	if(dest == cpu){ //moving from GPU to CPU
-		v = new T[nn*mm];
+		v = new T[(size_t)nn*mm];
-		gpuget(nn*mm, sizeof(T), vold, v);
+		gpuget((size_t)nn*mm, sizeof(T), vold, v);
 		if(*count == 1){ gpufree(vold); }
 		else{ --(*count); count = new int(1); }
 	}else{ //moving from CPU to GPU
-		v = (T *) gpualloc(nn*mm*sizeof(T));
+		v = (T *) gpualloc((size_t)nn*mm*sizeof(T));
-		gpuput(nn*mm, sizeof(T), vold, v);
+		gpuput((size_t)nn*mm, sizeof(T), vold, v);
 		if(*count == 1) delete[] vold;
 		else{ --(*count); count = new int(1);}
 	}
@@ -1351,3 +1351,4 @@ NRVECMAT_OPER2(Mat, -)
 }//end of the LA-namespace
 #endif/* _LA_MAT_H_ */
--- a/noncblas.cc
+++ b/noncblas.cc
@@ -195,13 +195,14 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
 {
 	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
 	if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
 	char U = BLAS_FORTRANCASE('u');
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
 	const FINT incytmp=incY;
-	FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
+	FORNAME(dspmv) (&U,&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp);
 #else
-	FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
+	FORNAME(dspmv) (&U,&N, &alpha, Ap, X, &incX, &beta, Y, &incY);
 #endif
 }
@@ -214,13 +215,14 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo,
 {
 	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
 	if(Uplo!=CblasLower) laerror("CblasLower uplo asserted");
 	char U = BLAS_FORTRANCASE('u');
 #ifdef FORINT
 	const FINT ntmp=N;
 	const FINT incxtmp=incX;
 	const FINT incytmp=incY;
-	FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
+	FORNAME(zhpmv) (&U,&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp);
 #else
-	FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY);
+	FORNAME(zhpmv) (&U,&N, alpha, Ap, X, &incX, beta, Y, &incY);
 #endif
 }
@@ -298,6 +300,8 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
 {
 	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
 	//swap a-b, m-n
 	char transb = BLAS_FORTRANCASE(TransB==CblasNoTrans?'N':'T');
 	char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T');
 #ifdef FORINT
 	const FINT mtmp=M;
 	const FINT ntmp=N;
@@ -305,10 +309,10 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
 	const FINT ldatmp=lda;
 	const FINT ldbtmp=ldb;
 	const FINT ldctmp=ldc;
-	FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
+	FORNAME(dgemm) (&transb,&transa,
 			&ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp);
 #else
-	FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T",
+	FORNAME(dgemm) (&transb,&transa,
 			&N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc);
 #endif
 }
@@ -322,6 +326,8 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
 {
 	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
 	//swap a-b, m-n
 	char transb =  BLAS_FORTRANCASE(TransB==CblasConjTrans?'C':(TransB==CblasNoTrans?'N':'T'));
 	char transa =  BLAS_FORTRANCASE(TransA==CblasConjTrans?'C':(TransA==CblasNoTrans?'N':'T'));
 #ifdef FORINT
 	const FINT mtmp=M;
 	const FINT ntmp=N;
@@ -329,12 +335,10 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA
 	const FINT ldatmp=lda;
 	const FINT ldbtmp=ldb;
 	const FINT ldctmp=ldc;
-	FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
+	FORNAME(zgemm) (&transb,&transa,
 			TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
 			&ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp);
 #else
-	FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
+	FORNAME(zgemm) ( &transb,&transa,
 			TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"),
 			&N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc);
 #endif
 }
@@ -347,19 +351,21 @@ void cblas_dgemv(const enum CBLAS_ORDER Order,
 		const double *X, const int incX, const double beta,
 		double *Y, const int incY)
 {
 	char transa =  BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T');
 	char transax =  BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N');
 #ifdef FORINT
 	const FINT mtmp=M;
 	const FINT ntmp=N;
 	const FINT ldatmp=lda;
 	const FINT incxtmp=incX;
 	const FINT incytmp=incY;
-	if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
+	if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
 	//swap n-m and toggle transposition
-	else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
+	else FORNAME(dgemv) (&transax, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp );
 #else
-	if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
+	if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
 	//swap n-m and toggle transposition
-	else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
+	else FORNAME(dgemv) (&transax, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY );
 #endif
 }
@@ -374,15 +380,16 @@ void cblas_zgemv(const enum CBLAS_ORDER Order,
 	if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted");
 	if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted");
 	//swap n-m and toggle transposition
 	char transa =  BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N');
 #ifdef FORINT
 	const FINT mtmp=M;
 	const FINT ntmp=N;
 	const FINT ldatmp=lda;
 	const FINT incxtmp=incX;
 	const FINT incytmp=incY;
-	FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
+	FORNAME(zgemv) (&transa, &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp );
 #else
-	FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
+	FORNAME(zgemv) (&transa, &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY );
 #endif
 }
--- a/nonclass.cc
+++ b/nonclass.cc
@@ -17,6 +17,7 @@
 */
 //this can be safely included since it contains ifdefs NONCBLAS and NONCLAPACK anyway
 #include "la_traits.h"
 #include "noncblas.h"
 #include "vec.h"
 #include "smat.h"
@@ -196,7 +197,7 @@ static void linear_solve_do(NRSMat<double> &a, double *b, const int nrhs, const
 	FINT r, *ipiv;
 	a.copyonwrite();
 	ipiv = new FINT[n];
-	char U = 'U';
+	char U = LAPACK_FORTRANCASE('u');
 #ifdef FORINT
        const FINT ntmp=n;
        const FINT nrhstmp=nrhs;
@@ -298,9 +299,9 @@ int linear_solve_x(NRMat<double> &_A, double *_B, const int _rhsCount, const int
 	const int A_rows = _A.nrows();
 	const int A_cols = _A.ncols();
-	const char fact  = _eq?'E':'N';
+	const char fact  = LAPACK_FORTRANCASE(_eq?'E':'N');
-	const char trans = 'T';//because of c-order
+	const char trans = LAPACK_FORTRANCASE('T');//because of c-order
-	char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const
+	char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const
 	if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){
 		laerror("linear_solve_x: invalid input matrices");
@@ -371,9 +372,9 @@ int linear_solve_x(NRMat<complex<double> > &_A, complex<double> *_B, const int _
 	const int A_rows = _A.nrows();
 	const int A_cols = _A.ncols();
-	const char fact  = _eq?'E':'N';
+	const char fact  = LAPACK_FORTRANCASE(_eq?'E':'N');
-	const char trans = 'T';//because of c-order
+	const char trans = LAPACK_FORTRANCASE('T');//because of c-order
-	char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const
+	char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const
 	if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){
 		laerror("linear_solve_x: invalid input matrices");
@@ -557,9 +558,9 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
 	if(b) b->copyonwrite();
 	FINT r = 0;
-	char U ='U';
+	char U =LAPACK_FORTRANCASE('u');
-	char vectors = 'V';
+	char vectors = LAPACK_FORTRANCASE('v');
-	if (!eivec) vectors = 'N';
+	if (!eivec) vectors = LAPACK_FORTRANCASE('n');
 	FINT LWORK = -1;
 	double WORKX;
 	FINT ldb=0; if(b) ldb=b->ncols();
@@ -588,7 +589,7 @@ void diagonalize(NRMat<double> &a, NRVec<double> &w, const bool eivec,
 #endif
        delete[] WORK;
-	if (vectors == 'V' && corder) a.transposeme(n);
+	if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) a.transposeme(n);
 	if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()");
 	if (r > 0) laerror("convergence problem in sygv/syev in diagonalize()");
@@ -620,12 +621,13 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
 	if(b) b->copyonwrite();
 	FINT r = 0;
-	char U ='U';
+	char U =LAPACK_FORTRANCASE('U');
-	char vectors = 'V';
+	char vectors = LAPACK_FORTRANCASE('V');
-	if (!eivec) vectors = 'N';
+	if (!eivec) vectors = LAPACK_FORTRANCASE('n');
 	FINT LWORK = -1;
 	complex<double> WORKX;
 	FINT ldb=0; if(b) ldb=b->ncols();
 std::cout << "test vectors "<<vectors<<std::endl;
 	// First call is to determine size of workspace
 	double *RWORK = new double[3*n+2]; 
@@ -652,7 +654,7 @@ void diagonalize(NRMat<complex<double> > &a, NRVec<double> &w, const bool eivec,
        delete[] WORK;
 	delete[] RWORK;
-	if (vectors == 'V' && corder) a.transposeme(n);
+	if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) {a.transposeme(n); a.conjugateme();}
 	if (r < 0) laerror("illegal argument in hegv/heev in diagonalize()");
 	if (r > 0) laerror("convergence problem in hegv/heev in diagonalize()");
@@ -684,8 +686,8 @@ void diagonalize(NRSMat<double> &a, NRVec<double> &w, NRMat<double> *v,
 	if(b) b->copyonwrite();
 	FINT r = 0;
-	char U = 'U';
+	char U = LAPACK_FORTRANCASE('u');
-	char job = v ? 'v' : 'n';
+	char job = LAPACK_FORTRANCASE(v ? 'v' : 'n');
 	double *WORK = new double[3*n];
 	FINT ldv=v?v->ncols():n;
@@ -730,8 +732,8 @@ void diagonalize(NRSMat<complex<double> > &a, NRVec<double> &w, NRMat<complex<do
 	if(b) b->copyonwrite();
 	FINT r = 0;
-	char U = 'U';
+	char U = LAPACK_FORTRANCASE('u');
-	char job = v ? 'v' : 'n';
+	char job = LAPACK_FORTRANCASE(v ? 'v' : 'n');
 	complex<double> *WORK = new complex<double>[2*n];
 	double *RWORK  = new double[3*n];
@@ -890,8 +892,8 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 		const int sorttype, const int biorthonormalize,
 		NRMat<double> *b, NRVec<double> *beta)
 {
-	if(n<=0) n = a.nrows();
+	if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
-	if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");
+	if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix");
 	if (n > wr.size()) 
 		laerror("inconsistent dimension of eigen vector in gdiagonalize()");
 	if (vl) if (n > vl->nrows() || n > vl->ncols())
@@ -911,8 +913,8 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 	if (beta) beta->copyonwrite();
 	if (b) b->copyonwrite();
-	char jobvl = vl ? 'V' : 'N';
+	char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n');
-	char jobvr = vr ? 'V' : 'N';
+	char jobvr = LAPACK_FORTRANCASE(vr ? 'v' : 'n');
 	double work0;
 	FINT lwork = -1;
 	FINT r;
@@ -1055,8 +1057,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
 		NRMat<complex<double> > *b, NRVec<complex<double> > *beta)
 {
-	if(n<=0) n = a.nrows();
+	if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
-	if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");
+	if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix");
 	if (n > w.size()) 
 		laerror("inconsistent dimension of eigen vector in gdiagonalize()");
 	if (vl) if (n > vl->nrows() || n > vl->ncols())
@@ -1075,8 +1077,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
 	if (beta) beta->copyonwrite();
 	if (b) b->copyonwrite();
-	char jobvl = vl ? 'V' : 'N';
+	char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n');
-	char jobvr = vr ? 'V' : 'N';
+	char jobvr = LAPACK_FORTRANCASE(vr ? 'v' : 'n');
 	complex<double> work0;
 	FINT lwork = -1;
 	FINT r;
@@ -1146,8 +1148,8 @@ void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
 	if (corder) {
-		if (vl) vl->transposeme(n);
+		if (vl) {vl->transposeme(n); vl->conjugateme();}
-		if (vr) vr->transposeme(n);
+		if (vr) {vr->transposeme(n); vr->conjugateme();}
 	}
 }
@@ -1159,8 +1161,8 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
 		const bool corder, int n, const int sorttype, const int biorthonormalize,
 		NRMat<double> *b, NRVec<double> *beta)
 {
-	if(n<=0)  n = a.nrows();
+	if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");}
-	if(n> a.nrows() || n ==  a.nrows() && n != a.ncols()) laerror("gdiagonalize() call for a non-square matrix");
+	if(n> a.nrows() || n ==  a.nrows() && n != a.ncols()) laerror("gdiagonalize() of too big submatrix");
 	NRVec<double> wr(n), wi(n);
 	NRMat<double> *rvl = 0;
@@ -1226,10 +1228,12 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
 template<>
 const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
 {
        NRMat<double> result(a.nrows(), a.ncols());
 #ifdef CUDALA
-        if(location == cpu){
+        if(a.location == cpu){
 #endif
-	NRMat<double> result(a.nrows(), a.ncols());
+//	NRMat<double> result(a.nrows(), a.ncols());
 	cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0], 2, result, 1);
 #ifdef CUDALA
        }else{
@@ -1242,11 +1246,13 @@ const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<dou
 template<>
 const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
 {
        NRMat<double> result(a.nrows(), a.ncols());
 #ifdef CUDALA
-        if(location == cpu){
+        if(a.location == cpu){
 #endif
-	NRMat<double> result(a.nrows(), a.ncols());
+//	NRMat<double> result(a.nrows(), a.ncols());
 	cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0]+1, 2, result, 1);
 #ifdef CUDALA
        }else{
@@ -1259,12 +1265,15 @@ const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<dou
 template<>
 const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &a)
 {
        NRMat <complex<double> > result(a.nrows(), a.ncols());
 #ifdef CUDALA
-        if(location == cpu){
+        if(a.location == cpu){
 #endif
-	NRMat <complex<double> > result(a.nrows(), a.ncols());
+//	NRMat <complex<double> > result(a.nrows(), a.ncols());
 	cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0], 2);
 #ifdef CUDALA
        }else{
@@ -1278,11 +1287,13 @@ const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &
 template<>
 const NRMat< complex<double> > imagmatrix<NRMat<double> > (const NRMat<double> &a)
 {
        NRMat< complex<double> > result(a.nrows(), a.ncols());
 #ifdef CUDALA
-        if(location == cpu){
+        if(a.location == cpu){
 #endif
-	NRMat< complex<double> > result(a.nrows(), a.ncols());
+//	NRMat< complex<double> > result(a.nrows(), a.ncols());
 	cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0]+1, 2);
 #ifdef CUDALA
        }else{
@@ -1577,7 +1588,7 @@ void cholesky(NRMat<double> &a, bool upper)
 if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
 FINT lda=a.ncols();
 FINT n=a.nrows();
-char uplo=upper?'U':'L';
+char uplo= LAPACK_FORTRANCASE(upper?'u':'l');
 FINT info;
 a.copyonwrite();
 FORNAME(dpotrf)(&uplo, &n, a, &lda, &info);
@@ -1596,7 +1607,7 @@ void cholesky(NRMat<complex<double> > &a, bool upper)
 if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky");
 FINT lda=a.ncols();
 FINT n=a.nrows();
-char uplo=upper?'U':'L';
+char uplo= LAPACK_FORTRANCASE(upper?'u':'l');
 FINT info;
 a.copyonwrite();
 a.transposeme();//switch to Fortran order
@@ -1788,3 +1799,4 @@ if(nchange&1)//still adjust to get determinant=1
 }
 }//namespace
--- a/smat.cc
+++ b/smat.cc
@@ -28,11 +28,8 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <errno.h>
 #include <unistd.h>
 extern "C" {
 	extern ssize_t read(int, void *, size_t);
 	extern ssize_t write(int, const void *, size_t);
 }
 namespace LA {
@@ -130,7 +127,7 @@ NRSMat<T> & NRSMat<T>::operator=(const T &a) {
 	NOT_GPU(*this);
 	copyonwrite();
 	memset(v, 0, NN2*sizeof(T));
-	for(register int i=0; i<nn; i++) v[i*(i+1)/2 + i] = a;
+	for(register int i=0; i<nn; i++) v[(size_t)i*(i+1)/2 + i] = a;
 	return *this;
 }
@@ -157,11 +154,11 @@ const T* NRSMat<T>::diagonalof(NRVec<T> &r, const bool divide, bool cache) const
 	if(divide){
 		for(register int i=0; i<nn; i++){
-			const T a = v[i*(i+1)/2+i];
+			const T a = v[(size_t)i*(i+1)/2+i];
 			if(a != 0.) r[i] /= a;
 		}
 	}else{
-	        for(register int i=0; i<nn; i++) r[i] = v[i*(i+1)/2+i];
+	        for(register int i=0; i<nn; i++) r[i] = v[(size_t)i*(i+1)/2+i];
 	}
 	return divide?NULL:&r[0];
@@ -178,7 +175,7 @@ const NRSMat<T> NRSMat<T>::operator-() const {
 	NOT_GPU(*this);
 	NRSMat<T> result(nn, getlocation());
-	for(register int i = 0; i<NN2; i++) result.v[i]= -v[i];
+	for(register size_t i = 0; i<NN2; i++) result.v[i]= -v[i];
 	return result;
 }
@@ -239,7 +236,7 @@ const T NRSMat<T>::trace() const {
 	NOT_GPU(*this);
 	T tmp = 0;
-	for(register int i=0; i<nn; i++) tmp += v[i*(i+1)/2+i];
+	for(register int i=0; i<nn; i++) tmp += v[(size_t)i*(i+1)/2+i];
 	return tmp;
 }
@@ -251,7 +248,7 @@ template<>
 void NRSMat<double>::randomize(const double &x) {
 	NOT_GPU(*this);
-	for(int i=0; i<NN2; ++i){
+	for(size_t i=0; i<NN2; ++i){
 		v[i] = x*(2.*random()/(1.+RAND_MAX) -1.);
 	}
 }
@@ -262,11 +259,11 @@ void NRSMat<double>::randomize(const double &x) {
 ******************************************************************************/
 template<>
 void NRSMat<complex<double> >::randomize(const double &x) {
-	for(register int i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
+	for(register size_t i=0; i<NN2; ++i) v[i].real() = x*(2.*random()/(1. + RAND_MAX) -1.);
-	for(register int i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
+	for(register size_t i=0; i<NN2; ++i) v[i].imag() = x*(2.*random()/(1. + RAND_MAX) -1.);
 	for(register int i=0; i<nn; ++i){
 		for(register int j=0; j<=i; ++j){
-		       	if(i == j) v[i*(i+1)/2+j].imag() = 0; //hermitean
+		       	if(i == j) v[i*(size_t)(i+1)/2+j].imag() = 0; //hermitean
 		}
 	}
 }
--- a/smat.h
+++ b/smat.h
@@ -25,7 +25,7 @@
 #include "la_traits.h"
 namespace LA {
-#define NN2 (nn*(nn+1)/2)
+#define NN2 ((size_t)nn*(nn+1)/2)
 /***************************************************************************//**
@@ -134,15 +134,15 @@ public:
 	void gemv(const T beta, NRVec<T> &r, const char trans, const T alpha, const NRVec<T> &x) const {r.gemv(beta,*this,trans,alpha,x);};
 	void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);};
-	inline const T& operator[](const int ij) const;
+	inline const T& operator[](const size_t ij) const;
-	inline T& operator[](const int ij);
+	inline T& operator[](const size_t ij);
 	inline const T& operator()(const int i, const int j) const;
 	inline T& operator()(const int i, const int j);
 	inline int nrows() const;
 	inline int ncols() const;
-	inline int size() const;
+	inline size_t size() const;
 	inline bool transp(const int i, const int j) const {return i>j;} //this can be used for compact storage of matrices, which are actually not symmetric, but one triangle of them is redundant
 	const typename LA_traits<T>::normtype norm(const T scalar = (T)0) const;
@@ -155,9 +155,9 @@ public:
 	void get(int fd, bool dimensions = 1, bool transp = 0);
        void put(int fd, bool dimensions = 1, bool transp = 0) const;
-	void copyonwrite();
+	void copyonwrite(bool detachonly=false);
-	void clear() {copyonwrite(); LA_traits<T>::clear(v,NN2);}; //zero out
+	void clear() {copyonwrite(true); LA_traits<T>::clear(v,NN2);}; //zero out
 	void resize(const int n);
 	void dealloc(void) {resize(0);}
@@ -212,7 +212,7 @@ inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n), count(new int(1)) {
 	if(location == cpu){
 #endif
 		v = new T[NN2];
-		if(a != (T)0) for(register int i = 0; i<NN2; i++) v[i] = a;
+		if(a != (T)0) for(register size_t i = 0; i<NN2; i++) v[i] = a;
 		else memset(v, 0, NN2*sizeof(T));
 #ifdef CUDALA
@@ -338,7 +338,7 @@ inline NRSMat<T> & NRSMat<T>::operator*=(const T &a) {
 	NOT_GPU(*this);
 	copyonwrite();
-	for(register int i = 0; i<NN2; ++i) v[i] *= a;
+	for(register size_t i = 0; i<NN2; ++i) v[i] *= a;
        return *this;
 }
@@ -353,7 +353,7 @@ inline NRSMat<T> & NRSMat<T>::operator+=(const T &a) {
 	NOT_GPU(*this);
 	copyonwrite();
-	for(register int i = 0; i<nn; i++) v[i*(i+1)/2 + i] += a;
+	for(register int i = 0; i<nn; i++) v[i*(size_t)(i+1)/2 + i] += a;
 	return *this;
 }
@@ -368,7 +368,7 @@ inline NRSMat<T> & NRSMat<T>::operator-=(const T &a) {
 	NOT_GPU(*this);
 	copyonwrite();
-	for(register int i = 0; i<nn; i++) v[i*(i+1)/2+i] -= a;
+	for(register int i = 0; i<nn; i++) v[i*(size_t)(i+1)/2+i] -= a;
 	return *this;
 }
@@ -438,7 +438,7 @@ inline NRSMat<T>& NRSMat<T>::operator+=(const NRSMat<T>& rhs) {
 	SAME_LOC(*this, rhs);
        copyonwrite();
-	for(register int i = 0; i<NN2; ++i) v[i] += rhs.v[i];
+	for(register size_t i = 0; i<NN2; ++i) v[i] += rhs.v[i];
        return *this;
 }
@@ -507,7 +507,7 @@ inline NRSMat<T>& NRSMat<T>::operator-=(const NRSMat<T>& rhs) {
 	NOT_GPU(*this);
        copyonwrite();
-        for(register int i = 0; i<NN2; ++i) v[i] -= rhs.v[i];
+        for(register size_t i = 0; i<NN2; ++i) v[i] -= rhs.v[i];
        return *this;
 }
@@ -540,7 +540,7 @@ inline const NRMat<T> NRSMat<T>::operator-(const NRMat<T> &rhs) const {
 * @return reference to the corresponding matrix element
 ******************************************************************************/
 template <typename T>
-inline T& NRSMat<T>::operator[](const int ij) {
+inline T& NRSMat<T>::operator[](const size_t ij) {
 #ifdef DEBUG
 	if(_LA_count_check && *count != 1) laerror("T& NRSMat<T>::operator[] used for matrix with count>1");
 	if(ij<0 || ij>=NN2) laerror("T& NRSMat<T>::operator[] out of range");
@@ -560,7 +560,7 @@ inline T& NRSMat<T>::operator[](const int ij) {
 * @return constant reference to the corresponding matrix element
 ******************************************************************************/
 template <typename T>
-inline const T & NRSMat<T>::operator[](const int ij) const {
+inline const T & NRSMat<T>::operator[](const size_t ij) const {
 #ifdef DEBUG
 	if(ij<0 || ij>=NN2) laerror("T& NRSMat<T>::operator[] out of range");
 	if(!v) laerror("T& NRSMat<T>::operator[] used for unallocated NRSmat<T> object");
@@ -578,8 +578,8 @@ inline const T & NRSMat<T>::operator[](const int ij) const {
 * @return cumulative index
 ******************************************************************************/
 template<typename T>
-inline T SMat_index(T i, T j) {
+inline size_t SMat_index(T i, T j) {
-	return (i>=j) ? i*(i+1)/2+j : j*(j+1)/2+i;
+	return (i>=j) ? i*(size_t)(i+1)/2+j : j*(size_t)(j+1)/2+i;
 }
 /***************************************************************************//**
@@ -590,8 +590,8 @@ inline T SMat_index(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template<typename T>
-inline T SMat_index_igej(T i, T j) {
+inline size_t SMat_index_igej(T i, T j) {
-	return i*(i+1)/2+j;
+	return i*(size_t)(i+1)/2+j;
 }
 /***************************************************************************//**
@@ -602,8 +602,8 @@ inline T SMat_index_igej(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template<typename T>
-inline T SMat_index_ilej(T i, T j) {
+inline size_t SMat_index_ilej(T i, T j) {
-	return j*(j+1)/2+i;
+	return j*(size_t)(j+1)/2+i;
 }
 /***************************************************************************//**
@@ -614,8 +614,8 @@ inline T SMat_index_ilej(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template<typename T>
-inline T SMat_index_1(T i, T j) {
+inline size_t SMat_index_1(T i, T j) {
-	return (i>=j)? i*(i-1)/2+j-1 : j*(j-1)/2+i-1;
+	return (i>=j)? i*(size_t)(i-1)/2+j-1 : j*(size_t)(j-1)/2+i-1;
 }
 /***************************************************************************//**
@@ -626,8 +626,8 @@ inline T SMat_index_1(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template<typename T>
-inline T SMat_index_1igej(T i, T j) {
+inline size_t SMat_index_1igej(T i, T j) {
-	return i*(i-1)/2+j-1;
+	return i*(size_t)(i-1)/2+j-1;
 }
 /***************************************************************************//**
@@ -638,21 +638,21 @@ inline T SMat_index_1igej(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template<typename T>
-inline T SMat_index_1ilej(T i, T j) {
+inline size_t SMat_index_1ilej(T i, T j) {
-	return j*(j-1)/2+i-1;
+	return j*(size_t)(j-1)/2+i-1;
 }
 //indexing for antisymmetric matrix (used by fourindex)
 template<typename T>
-inline T ASMat_index(T i, T j)
+inline size_t ASMat_index(T i, T j)
 {
 if(i == j) return -1;
-return (i>j) ? i*(i-1)/2+j : j*(j-1)/2+i;
+return (i>j) ? i*(size_t)(i-1)/2+j : j*(size_t)(j-1)/2+i;
 }
 template<typename T>
-inline T ASMat_index_1(T i, T j)
+inline size_t ASMat_index_1(T i, T j)
 {
 if(i == j) return -1;
 return (i>j)? (i-2)*(i-1)/2+j-1 : (j-2)*(j-1)/2+i-1;
@@ -715,7 +715,7 @@ inline int NRSMat<T>::ncols() const {
 * @return number of elements of this symmetric matrix of generalt type <code>T</code>
 ******************************************************************************/
 template <typename T>
-inline int NRSMat<T>::size() const {
+inline size_t NRSMat<T>::size() const {
        return NN2;
 }
@@ -758,7 +758,7 @@ inline const double NRSMat<double>::amin() const {
 		double val(0.0);
 		int index(-1);
 		ret = std::numeric_limits<double>::max();
-		for(register int i = 0; i < NN2; i++){
+		for(register size_t i = 0; i < NN2; i++){
 			val = std::abs(v[i]);
 			if(val < ret){ index = i; ret = val; }
 		}
@@ -812,7 +812,7 @@ inline const complex<double> NRSMat<complex<double> >::amin() const{
 		complex<double> z_val(0.0, 0.0);
 		min_val = std::numeric_limits<double>::max();
-		for(register int i = 0; i < NN2; i++){
+		for(register size_t i = 0; i < NN2; i++){
 			z_val = v[i];
 			val = std::abs(z_val.real()) + std::abs(z_val.imag());
 			if(val < min_val){ index = i; min_val = val; }	
@@ -920,7 +920,7 @@ NRSMat<T> & NRSMat<T>::operator=(const NRSMat<T> & rhs) {
 * @see NRSMat<T>::operator|=, NRSMat<T>::copyonwrite()
 ******************************************************************************/
 template <typename T>
-void NRSMat<T>::copyonwrite() {
+void NRSMat<T>::copyonwrite(bool detachonly) {
 	if(!count) laerror("calling NRSMat<T>::copyonwrite() for undefined NRSMat<T> object");
 	if(*count > 1){
 		(*count)--;
@@ -931,12 +931,12 @@ void NRSMat<T>::copyonwrite() {
 		if(location == cpu) {
 #endif
 			newv = new T[NN2];
-			memcpy(newv, v, NN2*sizeof(T));
+			if(!detachonly) memcpy(newv, v, NN2*sizeof(T));
 #ifdef CUDALA
 		}else{
 			newv = (T *) gpualloc(NN2*sizeof(T));
 			if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRSMat<T>::copyonwrite()");
-			cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
+			if(!detachonly) cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
 			TEST_CUBLAS("cublasScopy");//"NRSMat<T>::copyonwrite()"
 		}
 #endif
--- a/sparsemat.h
+++ b/sparsemat.h
@@ -132,8 +132,8 @@ public:
 	void setunsymmetric();//unwind the matrix assuming it was indeed symmetric
 	inline bool issymmetric() const {return symmetric;}
 	unsigned int length() const;
-	void copyonwrite();
+	void copyonwrite(bool detachonly=false);
-	void clear() {copyonwrite();unsort();deletelist();}
+	void clear() {copyonwrite(true); if(count) {delete count; count=NULL;}}
 	void simplify();
 	const T trace() const;
 	const typename LA_traits<T>::normtype norm(const T scalar=(T)0) const; //is const only mathematically, not in internal implementation - we have to simplify first
--- a/t.cc
+++ b/t.cc
@@ -1799,6 +1799,11 @@ for(int i=0; i<m.nrows(); ++i) for(int j=0; j<m.nrows(); ++j)
 	if(i!=j) err += abs(m2(i,j));
 cout <<"offdiagonality error = "<<err<<endl;
 err=0; 
 for(int i=0; i<m.nrows(); ++i) err += abs(m2(i,i) - eivals[i]);
 cout <<"eigenvalue error = "<<err<<endl;
 //test as general matrix
 NRVec<complex<double> > ww(m.nrows());
 NRMat<complex<double> > vl(m.nrows(),m.nrows()), vr(m.nrows(),m.nrows());
@@ -1812,6 +1817,11 @@ for(int i=0; i<m.nrows(); ++i) for(int j=0; j<m.nrows(); ++j)
        if(i!=j) err += abs(m3(i,j));
 cout <<"offdiagonality error 2 = "<<err<<endl;
 err=0;
 for(int i=0; i<m.nrows(); ++i) err += abs(m3(i,i) - ww[i]);
 cout <<"eigenvalue error = "<<err<<endl;
 }
--- a/vec.cc
+++ b/vec.cc
@@ -27,11 +27,8 @@
 #include <errno.h>
 #include "vec.h"
 #include "qsort.h"
 #include <unistd.h>
 extern "C" {
 	extern ssize_t read(int, void *, size_t);
 	extern ssize_t write(int, const void *, size_t);
 }
 namespace LA {
@@ -541,7 +538,7 @@ template<>
 void NRVec<double>::gemv(const double beta, const NRMat<double> &A, 
 		const char trans, const double alpha, const NRVec &x) {
 #ifdef DEBUG
-	if((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+	if((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
 #endif
 	SAME_LOC3(*this, x, A);
 	copyonwrite();
@@ -549,10 +546,10 @@ void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
 #ifdef CUDALA
 	if(location==cpu){
 #endif
-		cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+		cblas_dgemv(CblasRowMajor, (tolower(trans)=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
 #ifdef CUDALA
 	}else{
-		cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+		cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
 		TEST_CUBLAS("cublasDgemv");
 	}
 #endif
@@ -572,7 +569,7 @@ template<>
 void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
                const char trans, const double alpha, const NRVec<complex<double> > &x) {
 #ifdef DEBUG
-	if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+	if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
 #endif
 	SAME_LOC3(*this, x, A);
 	copyonwrite();
@@ -580,16 +577,16 @@ void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
 #ifdef CUDALA
 	if(location==cpu){
 #endif
-		cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+		cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
 				A.nrows(), A.ncols(), alpha, A, A.ncols(), (double *)x.v, 2, beta, (double *)v, 2);
-		cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+		cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
 				A.nrows(), A.ncols(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
 #ifdef CUDALA
 	}else{
-		cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
+		cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
 		TEST_CUBLAS("cublasDgemv");
-		cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
+		cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
 		TEST_CUBLAS("cublasDgemv");
 	}
 #endif
@@ -611,14 +608,14 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
 		const NRMat<complex<double> > &A, const char trans, 
 		const complex<double> alpha, const NRVec<complex<double> > &x) {
 #ifdef DEBUG
-	if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+	if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
 #endif
 	SAME_LOC3(*this, x, A);
 	copyonwrite();
 #ifdef CUDALA
 	if(location == cpu){
 #endif
-		cblas_zgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans), 
+		cblas_zgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans), 
 				A.nrows(), A.ncols(), &alpha, A, A.ncols(), x.v, 1, &beta, v, 1);
 #ifdef CUDALA
 	}else{
@@ -626,7 +623,7 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
 		const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
 		const cuDoubleComplex  _beta = make_cuDoubleComplex(beta.real(), beta.imag());
-		cublasZgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(),
+		cublasZgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(),
 				_alpha, (cuDoubleComplex*)(A[0]), A.ncols(), (cuDoubleComplex*)(x.v), 1, _beta, (cuDoubleComplex*)v, 1);
 		TEST_CUBLAS("cublasZgemv");
 	}
--- a/vec.h
+++ b/vec.h
@@ -149,10 +149,10 @@ public:
 	#endif
 	//! create separate copy of the data corresponding to this vector 
-	void copyonwrite();
+	void copyonwrite(bool detachonly=false);
 	//! purge this vector 
-	void clear() { copyonwrite(); LA_traits<T>::clear(v, nn); };
+	void clear() { copyonwrite(true); LA_traits<T>::clear(v, nn); };
 	//! assignment operator assigns given vector
 	NRVec& operator=(const NRVec &rhs);
@@ -806,7 +806,7 @@ NRVec<T>::~NRVec() {
 * make own copy of the underlying data connected with this vector
 ******************************************************************************/
 template <typename T>
-void NRVec<T>::copyonwrite() {
+void NRVec<T>::copyonwrite(bool detachonly) {
 	if(!count) laerror("copyonwrite of an undefined vector");
 	if(*count > 1) {
 		(*count)--;
@@ -817,12 +817,12 @@ void NRVec<T>::copyonwrite() {
 		if(location == cpu){
 #endif
 			newv = new T[nn];
-			memcpy(newv, v, nn*sizeof(T));
+			if(!detachonly) memcpy(newv, v, nn*sizeof(T));
 #ifdef CUDALA
 		}else{
 			newv = (T *) gpualloc(nn*sizeof(T));
 			if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRVec<T>::copyonwrite()");
-			cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
+			if(!detachonly) cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
 			TEST_CUBLAS("cublasScopy");//"NRVec<T>::copyonwrite()"
 		}
 #endif