diff --git a/ChangeLog b/ChangeLog index f3ef84f..538c841 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,7 +1,33 @@
+04.11.2013 added conjugateme() for corder=1 in complex diagonalize() and
+gdiagonalize() to get correct eigenvector convention for C-storage
+30.10.2013 macros for consistent lowercase and uppercase of character
+parameters for case-sensitive BLAS and LAPACK; case-insensitivity of these in vec.cc and mat.cc (cublas still not treated)
+29.10.2013 included la_traits.h in nonclass.cc to get correct extern "C" of cblas
+14.10.2013 added operator% to bitvector
+01.06.2012 more efficient clear() by detachonly parameter to copyonwrite()
+14.03.2012 fixed overflow in product of dimensions of NRMat and NRSMat
+13.03.2012 symmetry of integrals with different spins added to fourindex.h
+23.02.2012 included unistd.h
+23.02.2012 fixed max read/write size in multiget and multiput to 1GB
+24.01.2012 Improved DIIS (L.V.)
+19.01.2012 Fixed location for gpu in nonclass.cc
+02.06.2011 In oplus() =0 replaced by clear() to work on non-square matrices (J.P.)
+01.02.2011 Added trace2 for complex matrices by L. Veis
+18.01.2011 Minor bugfixes and compatibility with Intel C++ compiler by Roman Curik
+28.12.2010 Generalized diagonalization and functions of general complex matrices
+21.12.2010 Changed to size_t in matrix put,get to prevent overflow
+08.12.2010 Deallocate method added to LA_traits, used for memory saving in matrix exp
+24.11.2010 Added checking for integer overflow in exptimes
+23.10.2010 Fixed dependency on atlas for static libraries in configure.ac
+27.09.2010 Seed of CSRMat class added.
+22.09.2010 Allowed formal operator[] on gpu matrices
+22.09.2010 Added submatrix() to SparseSMat
+22.09.2010 Zgerc and zgeru implemented in non-cblas version
+21.09.2010 Fixed a bug in laerror macro sometimes causing compilation problems
 08.09.2010 RELEASE 0.6
-08.09.2010 Doxygen documentation by M. Sulc
+08.09.2010 Doxygen documentation for NRVec, NRMat, NRSMat classes contributed by M. Sulc
 08.09.2010 Extended CUBLAS support for NRVec, NRMat, NRSMat classes contributed by M. Sulc
-08.09.2010 Minor bugfixes contributed by M. Sulc
+08.09.2010 Minor bugfixes and improvements contributed by M.
Sulc 25.06.2010 Added proof-of-concept CUBLAS support for NRVec, NRMat, NRSMat 24.06.2010 Fixed a memory leak existing when MATPTR was defined 18.06.2010 added autoconf support for BLAS+LAPACK compiled with 64-bit integers and for CUBLAS @@ -60,3 +86,4 @@ xx.08.2008 fixed wrong permutation symmetry in previously unused (and untested) 11.03.2008 added cblas_idamax replacement for non-cblas 05.03.2008 fixed transposed bug in inverse() with non-cblas 26.02.2008 INITIAL RELEASE 0.1 + diff --git a/bitvector.cc b/bitvector.cc index d61c0b9..0a2c779 100644 --- a/bitvector.cc +++ b/bitvector.cc @@ -130,7 +130,7 @@ return *this; /*number of ones in a binary number, from "Hacker's delight" book*/ #ifdef LONG_IS_32 -static unsigned long word_popul(unsigned long x) +static unsigned int word_popul(unsigned long x) { x -= ((x>>1)&0x55555555); x = (x&0x33333333) + ((x>>2)&0x33333333); @@ -140,9 +140,10 @@ x+= (x>>16); return x&0x3f; } #else -static unsigned long word_popul(unsigned long x) +//@@@@ use an efficient trick +static unsigned int word_popul(unsigned long x) { -unsigned long s=0; +unsigned int s=0; for(int i=0; i<64; ++i) { if(x&1) ++s; @@ -156,7 +157,6 @@ return s; unsigned int bitvector::population(const unsigned int before) const { -//@@@before int i; unsigned int s=0; for(i=0; i>(i%blockbits))&1;}; const bool assign(const unsigned int i, const bool r) {if(r) set(i); else reset(i); return r;}; - void clear() {copyonwrite(); memset(v,0,nn*sizeof(bitvector_block));}; + void clear() {copyonwrite(true); memset(v,0,nn*sizeof(bitvector_block));}; void fill() {memset(v,0xff,nn*sizeof(bitvector_block));}; bool operator!=(const bitvector &rhs) const; bool operator==(const bitvector &rhs) const {return !(*this != rhs);}; @@ -70,13 +70,13 @@ public: bitvector operator&(const bitvector &rhs) const {return bitvector(*this) &= rhs;}; bitvector operator|(const bitvector &rhs) const {return bitvector(*this) |= rhs;}; bitvector operator^(const bitvector &rhs) const {return bitvector(*this) ^= rhs;}; - unsigned int population(const unsigned int before=0) const; //@@@number of 1's + unsigned int operator%(const bitvector &y) const; //number of differing bits + unsigned int population(const unsigned int before=0) const; //number of 1's //extended, truncated const i.e. 
not on *this but return new entity, take care of modulo's bits //logical shifts <<= >>= << >> not implemented yet //logical rotations not implemented yet }; - extern std::ostream & operator<<(std::ostream &s, const bitvector &x); extern std::istream & operator>>(std::istream &s, bitvector &x); diff --git a/csrmat.cc b/csrmat.cc index 4682ebd..4f319ce 100644 --- a/csrmat.cc +++ b/csrmat.cc @@ -52,8 +52,9 @@ INSTANTIZE(complex) */ //// forced instantization of functions in the header in the corresponding object file -template class CSRMat; -template class CSRMat >; +//@@@@@template class CSRMat; +//@@@@template class CSRMat >; +//@@@@ unfinished class commented out diff --git a/fortran.h b/fortran.h index b91e11d..e2a3dc6 100644 --- a/fortran.h +++ b/fortran.h @@ -11,3 +11,6 @@ #undef FORINT #define FINT int #endif + +#define BLAS_FORTRANCASE(x) toupper(x) +#define LAPACK_FORTRANCASE(x) toupper(x) diff --git a/fourindex.h b/fourindex.h index 3b490e4..78ab995 100644 --- a/fourindex.h +++ b/fourindex.h @@ -104,10 +104,10 @@ __attribute__((packed)) //later add symmetry of complex integrals -typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5} fourindexsymtype; //only permutation-nonequivalent elements are stored +typedef enum {undefined_symmetry=-1,nosymmetry=0, twoelectronrealmullikan=1, twoelectronrealdirac=2, T2ijab_aces=3, antisymtwoelectronrealdirac=4, T2IjAb_aces=5, twoelectronrealmullikanAB=6 } fourindexsymtype; //only permutation-nonequivalent elements are stored // these should actually be static private members of the fourindex class, but leads to an ICE on gcc3.2 -static const int fourindex_n_symmetrytypes=6; -static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8}; +static const int fourindex_n_symmetrytypes=7; +static const int fourindex_permnumbers[fourindex_n_symmetrytypes]={1,8,8,4,8,8,4}; static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]= { {{0,1,2,3,1}}, @@ -116,6 +116,7 @@ static const int fourindex_permutations[fourindex_n_symmetrytypes][8][5]= {{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1}}, {{0,1,2,3,1},{1,0,2,3,-1},{0,1,3,2,-1},{1,0,3,2,1},{2,3,0,1,1},{3,2,0,1,-1},{2,3,1,0,-1},{3,2,1,0,1}}, {{0,1,2,3,1}}, //T2IjAb_aces is like nosymmetry but different index ranges + {{0,1,2,3,1},{1,0,2,3,1},{0,1,3,2,1},{1,0,3,2,1}}, }; @@ -669,6 +670,108 @@ typedef typename LA_traits::normtype normtype; //make it as a derived class in order to be able to use it in a base class context - "supermatrix" operations template +class fourindex_dense : public NRMat { +public: + fourindex_dense(): NRMat() {}; + explicit fourindex_dense(const int n): NRMat(n*(n+1)/2,n*(n+1)/2) {}; + fourindex_dense(const NRMat &rhs): NRMat(rhs) {}; //be able to convert the parent class transparently to this + fourindex_dense(const T &a, const int n): NRMat(a,n*(n+1)/2,n*(n+1)/2) {}; + fourindex_dense(const T *a, const int n): NRMat(a,n*(n+1)/2,n*(n+1)/2) {}; + //and also construct it from sparse and externally stored fourindex classes + //it seems not possible to nest template just for the two constructors + fourindex_dense(const fourindex &rhs); + fourindex_dense(const fourindex_ext &rhs); + + T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l); + const T& operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const; + void resize(const int n) {(*this).NRMat::resize(n*(n+1)/2,n*(n+1)/2);}; + void putext(int f, 
T thr=1e-15); + int nbas() const {return (int)std::sqrt(2*(*this).nrows());}; + +}; + +template +void fourindex_dense::putext(int f, T thr) +{ +T y; +for(int i=1; i<=nbas(); ++i) for(int j=1; j<=i; ++j) + for(int k=1; k<=nbas(); ++k) for(int l=1; l<=k; ++l) + if((y=abs((*this)(i,j,k,l))) > thr) + { + matel4stored x; + x.elem= y; + x.index.indiv.i=i; + x.index.indiv.j=j; + x.index.indiv.k=k; + x.index.indiv.l=l; + if(sizeof(matel4stored) != write(f,&x,sizeof(matel4stored)) ) + laerror("write error in putext"); + } +} + + +template +fourindex_dense::fourindex_dense(const fourindex &rhs) : NRMat((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2) +{ +if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch"); +typename fourindex::iterator p; +#ifdef DEBUG +unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j); +unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l); +if (IJ<0 || IJ>=(unsigned int)NRMat::nn || KL<0 || KL>=(unsigned int)NRMat::mm) laerror("fourindex_dense index out of range in constructor"); +#endif +for(p=rhs.begin(); p!= rhs.end(); ++p) (*this)(p->index.indiv.i,p->index.indiv.j,p->index.indiv.k,p->index.indiv.l) = p->elem; +} + +template +fourindex_dense::fourindex_dense(const fourindex_ext &rhs) : NRMat((T)0,rhs.size()*(rhs.size()+1)/2,rhs.size()*(rhs.size()+1)/2) +{ +if(rhs.getsymmetry() != twoelectronrealmullikanAB ) laerror("fourindex_dense symmetry mismatch"); +typename fourindex_ext::iterator p; +for(p=rhs.begin(); p!= rhs.end(); ++p) + { +#ifdef DEBUG +unsigned int IJ = SMat_index_1(p->index.indiv.i,p->index.indiv.j); +unsigned int KL = SMat_index_1(p->index.indiv.k,p->index.indiv.l); +if (IJ<0 || IJ>=(unsigned int)NRMat::nn || KL<0 || KL>=(unsigned int)NRMat::mm) laerror("fourindex_dense index out of range in constructor"); +#endif + (*this)(p->index.indiv.i,p->index.indiv.j ,p->index.indiv.k,p->index.indiv.l) = p->elem; + } +} + + + +template +T& fourindex_dense::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) +{ +int I = SMat_index_1(i,j); +int J = SMat_index_1(k,l); +//I,J act as indices of a NRmat +#ifdef DEBUG + if (*NRMat::count != 1) laerror("lval (i,j,k,l) with count > 1 in fourindex_dense"); + if (I<0 || I>=NRMat::nn || J<0 || J>=NRMat::mm) laerror("fourindex_dense index out of range"); + if (!NRMat::v) laerror("access to unallocated fourindex_dense"); +#endif +return NRMat::operator()(I,J); +} + + +template +const T& fourindex_dense::operator() (unsigned int i, unsigned int j, unsigned int k, unsigned int l) const +{ +int I = SMat_index_1(i,j); +int J = SMat_index_1(k,l); +//I,J act as indices of a NRSmat +#ifdef DEBUG + if (I<0 || I>=NRMat::nn || J<0 || J>=NRMat::mm) laerror("fourindex_dense index out of range"); + if (!NRMat::v) laerror("access to unallocated fourindex_dense"); +#endif +return NRMat::operator()(I,J); +} + + +//////////////////// +template class fourindex_dense : public NRSMat { public: fourindex_dense(): NRSMat() {}; diff --git a/la_traits.h b/la_traits.h index 12dcaf5..23ceab9 100644 --- a/la_traits.h +++ b/la_traits.h @@ -35,6 +35,7 @@ #include #include #include +#include //using namespace std; @@ -56,7 +57,7 @@ extern "C" { #include "noncblas.h" #else extern "C" { -#include "clapack.h" +#include "atlas/clapack.h" } #endif @@ -213,7 +214,7 @@ typedef C normtype; typedef C realtype; typedef complex complextype; static inline C sqrabs(const complex x) { return x.real()*x.real()+x.imag()*x.imag();} -static inline bool gencmp(const complex *x, 
const complex *y, int n) {return memcmp(x,y,n*sizeof(complex));} +static inline bool gencmp(const complex *x, const complex *y, size_t n) {return memcmp(x,y,n*sizeof(complex));} static bool bigger(const complex &x, const complex &y) {laerror("complex comparison undefined"); return false;} static bool smaller(const complex &x, const complex &y) {laerror("complex comparison undefined"); return false;} static inline normtype norm (const complex &x) {return std::abs(x);} @@ -225,9 +226,10 @@ static void multiget(size_t n,int fd, complex *x, bool dimensions=0) size_t total=0; size_t system_limit = (1L<<30)/sizeof(complex); //do not expect too much from the system and read at most 1GB at once ssize_t r; + size_t nn; do{ - r=read(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex)); - if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "< system_limit ? system_limit : n-total)*sizeof(complex)); + if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "<); if(r%sizeof(complex)) laerror("read error 2"); } @@ -238,16 +240,17 @@ static void multiput(size_t n, int fd, const complex *x, bool dimensions=0) size_t total=0; size_t system_limit = (1L<<30)/sizeof(complex); //do not expect too much from the system and write at most 1GB at once ssize_t r; + size_t nn; do{ - r=write(fd,x+total,(n-total > system_limit ? system_limit : n-total)*sizeof(complex)); - if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "< system_limit ? system_limit : n-total)*sizeof(complex)); + if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "<); if(r%sizeof(complex)) laerror("write error 2"); } while(total < n); } -static void copy(complex *dest, complex *src, unsigned int n) {memcpy(dest,src,n*sizeof(complex));} -static void clear(complex *dest, unsigned int n) {memset(dest,0,n*sizeof(complex));} +static void copy(complex *dest, complex *src, size_t n) {memcpy(dest,src,n*sizeof(complex));} +static void clear(complex *dest, size_t n) {memset(dest,0,n*sizeof(complex));} static void copyonwrite(complex &x) {}; static void clearme(complex &x) {x=0;}; static void deallocate(complex &x) {}; @@ -266,7 +269,7 @@ typedef C normtype; typedef C realtype; typedef complex complextype; static inline C sqrabs(const C x) { return x*x;} -static inline bool gencmp(const C *x, const C *y, int n) {return memcmp(x,y,n*sizeof(C));} +static inline bool gencmp(const C *x, const C *y, size_t n) {return memcmp(x,y,n*sizeof(C));} static inline bool bigger(const C &x, const C &y) {return x>y;} static inline bool smaller(const C &x, const C &y) {return x system_limit ? system_limit : n-total)*sizeof(C)); - if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "< system_limit ? system_limit : n-total)*sizeof(C)); + if(r<0 || r==0 && nn!=0 ) {std::cout<<"read returned "< system_limit ? system_limit : n-total)*sizeof(C)); - if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "< system_limit ? 
system_limit : n-total)*sizeof(C)); + if(r<0 || r==0 && nn!=0 ) {std::cout<<"write returned "< producttype; \ typedef typename LA_traits::normtype normtype; \ typedef X::realtype> realtype; \ typedef X::complextype> complextype; \ -static bool gencmp(const C *x, const C *y, int n) {for(int i=0; iy;} \ static inline bool smaller(const C &x, const C &y) {return x &x) {return x.norm();} \ @@ -332,8 +337,8 @@ static void put(int fd, const X &x, bool dimensions=1, bool transp=0) {x.put( static void get(int fd, X &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions,transp);} \ static void multiput(size_t n,int fd, const X *x, bool dimensions=1) {for(size_t i=0; i *x, bool dimensions=1) {for(size_t i=0; i &x) {x.copyonwrite();}\ static void clearme(X &x) {x.clear();}\ static void deallocate(X &x) {x.dealloc();}\ @@ -359,7 +364,7 @@ typedef NRMat producttype; \ typedef typename LA_traits::normtype normtype; \ typedef X::realtype> realtype; \ typedef X::complextype> complextype; \ -static bool gencmp(const C *x, const C *y, int n) {for(int i=0; iy;} \ static inline bool smaller(const C &x, const C &y) {return x &x) {return x.norm();} \ @@ -368,8 +373,8 @@ static void put(int fd, const X &x, bool dimensions=1, bool transp=0) {x.put( static void get(int fd, X &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions);} \ static void multiput(size_t n,int fd, const X *x, bool dimensions=1) {for(size_t i=0; i *x, bool dimensions=1) {for(size_t i=0; i &x) {x.copyonwrite();} \ static void clearme(X &x) {x.clear();} \ static void deallocate(X &x) {x.dealloc();} \ diff --git a/laerror.cc b/laerror.cc index 99be356..5dd8816 100644 --- a/laerror.cc +++ b/laerror.cc @@ -87,7 +87,26 @@ extern "C" void ATL_xerbla(int p, char *rout, char *form, ...){ laerror(msg0); } -int cblas_errprn(int ierr, int info, char *form, ...) { +#ifndef NONCBLAS +#include "cblas.h" +#include +extern "C" void cblas_xerbla(int p, const char *rout, const char *form, ...) +{ + va_list argptr; + + va_start(argptr, form); + if (p) + { + fprintf(stdout, "Parameter %d to routine %s was incorrect\n", p, rout); + fprintf(stderr, "Parameter %d to routine %s was incorrect\n", p, rout); + } + vfprintf(stdout, form, argptr); + vfprintf(stderr, form, argptr); + va_end(argptr); + laerror("terminating in cblas_xerbla"); +} + +extern "C" int cblas_errprn(int ierr, int info, char *form, ...) { char msg0[1024], *msg; va_list argptr; va_start(argptr, form); @@ -98,5 +117,6 @@ int cblas_errprn(int ierr, int info, char *form, ...) 
{ laerror(msg0); return 0; } +#endif }//namespace diff --git a/mat.cc b/mat.cc index 210d77d..62a9f5e 100644 --- a/mat.cc +++ b/mat.cc @@ -26,11 +26,8 @@ #include #include #include +#include -extern "C" { - extern ssize_t read(int, void *, size_t); - extern ssize_t write(int, const void *, size_t); -} namespace LA { @@ -77,14 +74,14 @@ const NRMat NRMat::otimes(const NRMat &rhs, bool reversecolumns) const { T c = (*this)(i,j); for(k=0;k NRMat::row(const int i, int l) const { #ifdef MATPTR v[i] #else - v + i*l + v + i*(size_t)l #endif , l); return r; @@ -144,7 +141,7 @@ void NRMat::put(int fd, bool dim, bool transp) const { #ifdef MATPTR v[i][j] #else - v[i*mm+j] + v[i*(size_t)mm+j] #endif ,dim ,transp); } @@ -196,7 +193,7 @@ void NRMat::get(int fd, bool dim, bool transp){ #ifdef MATPTR v[i][j] #else - v[i*mm+j] + v[i*(size_t)mm+j] #endif ,dim,transp); } @@ -476,13 +473,13 @@ NRMat & NRMat::operator-=(const T &a) { ******************************************************************************/ template <> const NRMat NRMat::operator-() const { - const int nm = nn*mm; + const size_t nm = (size_t)nn*mm; NRMat result(nn, mm, getlocation()); #ifdef CUDALA if(location == cpu) { #endif #ifdef MATPTR - for(register int i=0; i NRMat::operator-() const { ******************************************************************************/ template <> const NRMat > NRMat >::operator-() const { - const int nm = nn*mm; + const size_t nm = (size_t)nn*mm; NRMat > result(nn, mm, getlocation()); #ifdef CUDALA if(location == cpu) { #endif #ifdef MATPTR - for(register int i=0; i)); cblas_zscal(nm, &CMONE, result.v, 1); @@ -539,9 +536,9 @@ const NRMat NRMat::operator-() const { NRMat result(nn, mm, getlocation()); #ifdef MATPTR - for(register int i=0; i NRMat::operator&(const NRMat &b) const { if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem"); for(register int i=0; i NRMat::operator|(const NRMat &b) const { for (int j=0; j NRMat::rsum() const { #ifdef CUDALA }else{ for(register int i=0;i > NRMat >::rsum() const { #ifdef CUDALA }else{ for(register int i=0;i NRMat::submatrix(const int fromrow, const int torow, const int #ifdef MATPTR memcpy(r.v[i - fromrow], v[i] + fromcol, m*sizeof(T)); #else - memcpy(r.v+(i - fromrow)*m, v + i*mm + fromcol, m*sizeof(T)); + memcpy(r.v+(i - fromrow)*m, v + i*(size_t)mm + fromcol, m*sizeof(T)); #endif } #ifdef CUDALA }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); for(register int i=fromrow; i<=torow; ++i){ - cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1); + cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*(size_t)mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1); TEST_CUBLAS("cublasScopy"); } } @@ -786,13 +783,13 @@ void NRMat::storesubmatrix(const int fromrow, const int fromcol, const NRMat #ifdef MATPTR memcpy(v[i] + fromcol, rhs.v[i - fromrow], m*sizeof(T)); #else - memcpy(v + i*mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T)); + memcpy(v + i*(size_t)mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T)); #endif #ifdef CUDALA }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); - cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*mm + fromcol), 1); + cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*(size_t)mm + fromcol), 1); } #endif } @@ -821,8 +818,8 @@ NRMat& NRMat::transposeme(const int _n) { v[j][i] = tmp; #else 
register int a, b; - a = i*mm + j; - b = j*mm + i; + a = i*(size_t)mm + j; + b = j*(size_t)mm + i; T tmp = v[a]; v[a] = v[b]; v[b] = tmp; @@ -847,7 +844,7 @@ NRMat& NRMat::transposeme(const int _n) { ******************************************************************************/ template<> NRMat >::NRMat(const NRMat &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) { - const int nn_mm = nn*mm; + const size_t nn_mm = (size_t)nn*mm; #ifdef CUDALA if(location == cpu){ #endif @@ -888,7 +885,7 @@ NRMat >::NRMat(const NRMat &rhs, bool imagpart): nn(rhs. ******************************************************************************/ template<> NRMat::NRMat(const NRMat > &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) { - const int nn_mm = nn*mm; + const size_t nn_mm = (size_t) nn*mm; #ifdef CUDALA if(location == cpu){ #endif @@ -1079,7 +1076,7 @@ const NRSMat NRMat::timestransposed() const { #ifdef MATPTR r(i, j) = cblas_ddot(mm, v[i], 1, v[j], 1); #else - r(i, j) = cblas_ddot(mm, v + i*mm, 1, v + j*mm, 1); + r(i, j) = cblas_ddot(mm, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1); #endif } } @@ -1087,7 +1084,7 @@ const NRSMat NRMat::timestransposed() const { }else{ for(i=0; i > NRMat >::timestransposed() const #ifdef MATPTR cblas_zdotc_sub(nn, v[i], 1, v[j], 1, &r(i,j)); #else - cblas_zdotc_sub(nn, v + i*mm, 1, v + j*mm, 1, &r(i,j)); + cblas_zdotc_sub(nn, v + i*(size_t)mm, 1, v + j*(size_t)mm, 1, &r(i,j)); #endif } } @@ -1121,7 +1118,7 @@ const NRSMat > NRMat >::timestransposed() const }else{ for(i=0; i*> (&val)); } @@ -1172,7 +1169,7 @@ void NRMat::randomize(const double &x) { }else{ NRMat tmp(nn, mm, cpu); double *tmp_data = tmp; - for(register int i=0; ilocation); @@ -1203,7 +1200,7 @@ void NRMat >::randomize(const double &x) { }else{ NRMat > tmp(nn, mm, cpu); complex *tmp_data = tmp; - for(register int i=0; i(re, im); @@ -1226,10 +1223,10 @@ NRMat& NRMat::operator*=(const double &a) { #ifdef CUDALA if(location == cpu){ #endif - cblas_dscal(nn*mm, a, *this, 1); + cblas_dscal((size_t)nn*mm, a, *this, 1); #ifdef CUDALA }else{ - cublasDscal(nn*mm, a, v, 1); + cublasDscal((size_t)nn*mm, a, v, 1); TEST_CUBLAS("cublasDscal"); } #endif @@ -1249,11 +1246,11 @@ NRMat >::operator*=(const complex &a) { #ifdef CUDALA if(location == cpu){ #endif - cblas_zscal(nn*mm, &a, (*this)[0], 1); + cblas_zscal((size_t)nn*mm, &a, (*this)[0], 1); #ifdef CUDALA }else{ const cuDoubleComplex fac = *(reinterpret_cast (&a)); - cublasZscal(nn*mm, fac, (cuDoubleComplex *)v, 1); + cublasZscal((size_t)nn*mm, fac, (cuDoubleComplex *)v, 1); TEST_CUBLAS("cublasZscal"); } #endif @@ -1271,9 +1268,9 @@ NRMat & NRMat::operator*=(const T &a) { NOT_GPU(*this); copyonwrite(); #ifdef MATPTR - for(register int i=0; i< nn*mm; i++) v[0][i] *= a; + for(register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= a; #else - for(register int i=0; i< nn*mm; i++) v[i] *= a; + for(register size_t i=0; i< (size_t)nn*mm; i++) v[i] *= a; #endif return *this; } @@ -1294,10 +1291,10 @@ NRMat & NRMat::operator+=(const NRMat &rhs) { #ifdef CUDALA if(location == cpu){ #endif - cblas_daxpy(nn*mm, 1.0, rhs, 1, *this, 1); + cblas_daxpy((size_t)nn*mm, 1.0, rhs, 1, *this, 1); #ifdef CUDALA }else{ - cublasDaxpy(nn*mm, 1.0, rhs, 1, v, 1); + cublasDaxpy((size_t)nn*mm, 1.0, rhs, 1, v, 1); TEST_CUBLAS("cublasDaxpy"); } #endif @@ -1320,10 +1317,10 @@ NRMat >::operator+=(const NRMat< complex > &rhs) { #ifdef CUDALA if(location == cpu){ #endif - cblas_zaxpy(nn*mm, &CONE, rhs[0], 1, (*this)[0], 1); + cblas_zaxpy((size_t)nn*mm, 
&CONE, rhs[0], 1, (*this)[0], 1); #ifdef CUDALA }else{ - cublasZaxpy(nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); + cublasZaxpy((size_t)nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); } #endif return *this; @@ -1345,9 +1342,9 @@ NRMat & NRMat::operator+=(const NRMat &rhs) { copyonwrite(); #ifdef MATPTR - for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i]; + for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i]; #else - for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i]; + for(size_t i=0; i< (size_t)nn*mm; i++) v[i] += rhs.v[i]; #endif return *this; } @@ -1368,10 +1365,10 @@ NRMat & NRMat::operator-=(const NRMat &rhs) { #ifdef CUDALA if(location == cpu){ #endif - cblas_daxpy(nn*mm, -1.0, rhs, 1, *this, 1); + cblas_daxpy((size_t)nn*mm, -1.0, rhs, 1, *this, 1); #ifdef CUDALA }else{ - cublasDaxpy(nn*mm, -1.0, rhs, 1, v, 1); + cublasDaxpy((size_t)nn*mm, -1.0, rhs, 1, v, 1); } #endif return *this; @@ -1395,10 +1392,10 @@ NRMat< complex >::operator-=(const NRMat< complex > &rhs) { #ifdef CUDALA if(location == cpu){ #endif - cblas_zaxpy(nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1); + cblas_zaxpy((size_t)nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1); #ifdef CUDALA }else{ - cublasZaxpy(nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); + cublasZaxpy((size_t)nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); } #endif return *this; @@ -1421,9 +1418,9 @@ NRMat & NRMat::operator-=(const NRMat &rhs) { copyonwrite(); #ifdef MATPTR - for(int i=0; i< nn*mm; i++) v[0][i] += rhs.v[0][i]; + for(size_t i=0; i< (size_t)nn*mm; i++) v[0][i] += rhs.v[0][i]; #else - for(int i=0; i< nn*mm; i++) v[i] += rhs.v[i]; + for(size_t i=0; i<(size_t) nn*mm; i++) v[i] += rhs.v[i]; #endif return *this; } @@ -1693,10 +1690,10 @@ const double NRMat::dot(const NRMat &rhs) const { #ifdef CUDALA if(location == cpu){ #endif - ret = cblas_ddot(nn*mm, (*this)[0], 1, rhs[0], 1); + ret = cblas_ddot((size_t)nn*mm, (*this)[0], 1, rhs[0], 1); #ifdef CUDALA }else{ - ret = cublasDdot(nn*mm, v, 1, rhs.v, 1); + ret = cublasDdot((size_t)nn*mm, v, 1, rhs.v, 1); } #endif return ret; @@ -1721,10 +1718,10 @@ NRMat >::dot(const NRMat > &rhs) const { #ifdef CUDALA if(location == cpu){ #endif - cblas_zdotc_sub(nn*mm, (*this)[0], 1, rhs[0], 1, &ret); + cblas_zdotc_sub((size_t)nn*mm, (*this)[0], 1, rhs[0], 1, &ret); #ifdef CUDALA }else{ - cuDoubleComplex val = cublasZdotc(nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1); + cuDoubleComplex val = cublasZdotc((size_t)nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1); ret = *(reinterpret_cast*> (&val)); } #endif @@ -1804,7 +1801,7 @@ void NRMat::diagmultl(const NRVec &rhs) { for(register int i=0; i >::diagmultl(const NRVec< complex > &rhs) { }else{ for(register int i=0; i::operator*(const NRSMat &rhs) const { #ifdef CUDALA }else{ for(register int i=0; i >::operator*(const NRSMat< complex > &rhs) const #ifdef CUDALA }else{ for(register int i=0; i >& NRMat >::conjugateme() { #ifdef CUDALA if(location == cpu){ #endif - cblas_dscal(mm*nn, -1.0, (double *)((*this)[0]) + 1, 2); + cblas_dscal((size_t)mm*nn, -1.0, (double *)((*this)[0]) + 1, 2); #ifdef CUDALA }else{ - cublasDscal(mm*nn, -1.0, (double *)(this->v) + 1, 2); + cublasDscal((size_t)mm*nn, -1.0, (double *)(this->v) + 1, 2); } #endif return *this; @@ -2048,12 +2045,12 @@ void NRMat::gemm(const double &beta, const NRMat &a, const char transa, const NRMat &b, const char transb, const double &alpha) { - int 
k(transa=='n'?a.mm:a.nn); + int k(tolower(transa)=='n'?a.mm:a.nn); #ifdef DEBUG - int l(transa=='n'?a.nn:a.mm); - int kk(transb=='n'?b.nn:b.mm); - int ll(transb=='n'?b.mm:b.nn); + int l(tolower(transa)=='n'?a.nn:a.mm); + int kk(tolower(transb)=='n'?b.nn:b.mm); + int ll(tolower(transb)=='n'?b.mm:b.nn); if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat::gemm(...)"); if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm"); #endif @@ -2066,8 +2063,8 @@ void NRMat::gemm(const double &beta, const NRMat &a, #ifdef CUDALA if(location == cpu){ #endif - cblas_dgemm(CblasRowMajor, (transa=='n' ? CblasNoTrans : CblasTrans), - (transb=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a, + cblas_dgemm(CblasRowMajor, (tolower(transa)=='n' ? CblasNoTrans : CblasTrans), + (tolower(transb)=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a, a.mm, b , b.mm, beta, *this , mm); #ifdef CUDALA }else{ @@ -2083,20 +2080,20 @@ void NRMat >::gemm(const complex & beta, const NRMat > & b, const char transb, const complex & alpha) { - int k(transa=='n'?a.mm:a.nn); + int k(tolower(transa)=='n'?a.mm:a.nn); #ifdef DEBUG - int l(transa=='n'?a.nn:a.mm); - int kk(transb=='n'?b.nn:b.mm); - int ll(transb=='n'?b.mm:b.nn); + int l(tolower(transa)=='n'?a.nn:a.mm); + int kk(tolower(transb)=='n'?b.nn:b.mm); + int ll(tolower(transb)=='n'?b.mm:b.nn); if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat >::gemm(...)"); #endif if (alpha==CZERO && beta==CONE) return; copyonwrite(); cblas_zgemm(CblasRowMajor, - (transa=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)), - (transb=='n' ? CblasNoTrans : (transa=='c'?CblasConjTrans:CblasTrans)), + (tolower(transa)=='n' ? CblasNoTrans : (tolower(transa)=='c'?CblasConjTrans:CblasTrans)), + (tolower(transb)=='n' ? 
CblasNoTrans : (tolower(transb)=='c'?CblasConjTrans:CblasTrans)), nn, mm, k, &alpha, a , a.mm, b , b.mm, &beta, *this , mm); } @@ -2113,10 +2110,10 @@ const double NRMat::norm(const double scalar) const { #ifdef CUDALA if(location == cpu){ #endif - return cblas_dnrm2(nn*mm, (*this)[0], 1); + return cblas_dnrm2((size_t)nn*mm, (*this)[0], 1); #ifdef CUDALA }else{ - return cublasDnrm2(nn*mm, v, 1); + return cublasDnrm2((size_t)nn*mm, v, 1); } #endif } @@ -2130,7 +2127,7 @@ const double NRMat::norm(const double scalar) const { #ifdef MATPTR tmp = v[i][j]; #else - tmp = v[i*mm+j]; + tmp = v[i*(size_t)mm+j]; #endif if(i == j) tmp -= scalar; sum += tmp*tmp; @@ -2152,10 +2149,10 @@ const double NRMat >::norm(const complex scalar) const { #ifdef CUDALA if(location == cpu){ #endif - return cblas_dznrm2(nn*mm, (*this)[0], 1); + return cblas_dznrm2((size_t)nn*mm, (*this)[0], 1); #ifdef CUDALA }else{ - return cublasDznrm2(nn*mm, (cuDoubleComplex*)v, 1); + return cublasDznrm2((size_t)nn*mm, (cuDoubleComplex*)v, 1); } #endif } @@ -2168,7 +2165,7 @@ const double NRMat >::norm(const complex scalar) const { #ifdef MATPTR tmp = v[i][j]; #else - tmp = v[i*mm+j]; + tmp = v[i*(size_t)mm+j]; #endif if(i == j) tmp -= scalar; const double re = tmp.real(); @@ -2195,10 +2192,10 @@ void NRMat::axpy(const double alpha, const NRMat &mat) { #ifdef CUDALA if(location == cpu){ #endif - cblas_daxpy(nn*mm, alpha, mat, 1, *this, 1); + cblas_daxpy((size_t)nn*mm, alpha, mat, 1, *this, 1); #ifdef CUDALA }else{ - cublasDaxpy(nn*mm, alpha, mat, 1, *this, 1); + cublasDaxpy((size_t)nn*mm, alpha, mat, 1, *this, 1); } #endif } @@ -2221,7 +2218,7 @@ void NRMat >::axpy(const complex alpha, #ifdef CUDALA if(location == cpu){ #endif - cblas_zaxpy(nn*mm, &alpha, mat, 1, (*this)[0], 1); + cblas_zaxpy((size_t)nn*mm, &alpha, mat, 1, (*this)[0], 1); #ifdef CUDALA }else{ const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag()); @@ -2245,7 +2242,7 @@ const T NRMat::trace() const { #ifdef MATPTR for(register int i=0; i& NRMat::swap_rows(){ #ifdef CUDALA }else{ for(register int i=0; i >& NRMat >::swap_rows(){ #ifdef CUDALA }else{ for(register int i=0; i& NRMat::swap_rows(){ }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat::swap_rows"); for(register int i=0; i& NRMat::swap_rows_cols(){ #ifdef CUDALA }else{ for(register int i=0; i >& NRMat >::swap_rows_cols(){ #ifdef CUDALA }else{ for(register int i=0;i NRMat& NRMat::swap_rows_cols(){ const int n_pul = nn >> 1; const int m_pul = mm >> 1; - const int dim = nn*mm; + const size_t dim = (size_t)nn*mm; T *data_ptr; T tmp; @@ -2837,7 +2834,7 @@ NRMat& NRMat::swap_rows_cols(){ }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat::swap_rows_cols"); for(register int i=0; i; friend class NRSMat; @@ -89,16 +89,16 @@ public: //! 
explicit constructor converting vector into a NRMat object #ifdef MATPTR explicit NRMat(const NRVec &rhs, const int n, const int m, const int offset = 0):NRMat(&rhs[0][0] + offset , n, m){ - if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); + if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); }; #else explicit NRMat(const NRVec &rhs, const int n, const int m, const int offset = 0); #endif #ifdef MATPTR - const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits::gencmp(v[0],rhs.v[0],nn*mm);} //memcmp for scalars else elementwise + const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits::gencmp(v[0],rhs.v[0],(size_t)nn*mm);} //memcmp for scalars else elementwise #else - const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits::gencmp(v,rhs.v,nn*mm);} //memcmp for scalars else elementwise + const bool operator!=(const NRMat &rhs) const {if(nn!=rhs.nn || mm!=rhs.mm) return 1; return LA_traits::gencmp(v,rhs.v,(size_t)nn*mm);} //memcmp for scalars else elementwise #endif const bool operator==(const NRMat &rhs) const {return !(*this != rhs);}; @@ -107,7 +107,7 @@ public: inline int getcount() const {return count?*count:0;} //! ensure that the data of this matrix are referenced exactly once - void copyonwrite(); + void copyonwrite(bool detachonly=false); /***************************************************************************//** * routines for CUDA related stuff @@ -260,7 +260,7 @@ public: //! get the number of columns inline int ncols() const; //! get the number of matrix elements - inline int size() const; + inline size_t size() const; //! unformatted input void get(int fd, bool dimensions = 1, bool transposed = false); @@ -274,8 +274,8 @@ public: //! 
set all matrix elements equal to zero void clear(){ if(nn&&mm){ - copyonwrite(); - LA_traits::clear((*this)[0], nn*mm); + copyonwrite(true); + LA_traits::clear((*this)[0], (size_t)nn*mm); } }; @@ -379,7 +379,7 @@ template NRMat::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) { T* p; *count = 1; - const int nm = n*m; + const size_t nm = (size_t)n*m; #ifdef CUDALA location = (loc==undefined?DEFAULT_LOC:loc); if(location == cpu) { @@ -408,7 +408,7 @@ NRMat::NRMat(const int n, const int m, const GPUID loc) : nn(n), mm(m), count ******************************************************************************/ template NRMat::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), mm(m), count(new int) { - const int nm = n*m; + const size_t nm = (size_t)n*m; T *p; *count = 1; @@ -447,7 +447,7 @@ NRMat::NRMat(const T &a, const int n, const int m, const GPUID loc) : nn(n), ******************************************************************************/ template NRMat::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new int) { - const int nm = n*m; + const size_t nm = (size_t)n*m; T *p; *count = 1; @@ -460,7 +460,7 @@ NRMat::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new p = v[0] = new T[nm]; for (register int i=1; i::NRMat(const T &a, const int n, const int m) : nn(n), mm(m), count(new ******************************************************************************/ template NRMat::NRMat(const T *a, const int n, const int m) : nn(n), mm(m), count(new int) { - const int nm = n*m; + const size_t nm = (size_t)n*m; #ifdef CUDALA location = DEFAULT_LOC; #endif @@ -546,10 +546,10 @@ NRMat::NRMat(const NRSMat &rhs) { *count = 1; #ifdef MATPTR v = new T*[nn]; - v[0] = new T[mm*nn]; + v[0] = new T[(size_t)mm*nn]; for (int i=1; i::NRMat(const NRSMat &rhs) { #else for (i=0; i::NRMat(const NRSMat &rhs) { template NRMat::NRMat(const NRVec &rhs, const int n, const int m, const int offset) { - if (offset < 0 || n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); + if (offset < 0 || (size_t)n*m + offset > rhs.nn) laerror("matrix dimensions and offset incompatible with vector length"); #ifdef CUDALA location=rhs.location; @@ -628,7 +628,7 @@ inline T* NRMat::operator[](const int i) { #ifdef MATPTR return v[i]; #else - return v + i*mm; + return v + i*(size_t)mm; #endif } @@ -646,7 +646,7 @@ inline const T* NRMat::operator[](const int i) const { #ifdef MATPTR return v[i]; #else - return v + i*mm; + return v + i*(size_t)mm; #endif } @@ -668,7 +668,7 @@ inline T& NRMat::operator()(const int i, const int j){ #ifdef MATPTR return v[i][j]; #else - return v[i*mm + j]; + return v[i*(size_t)mm + j]; #endif } @@ -689,7 +689,7 @@ inline const T& NRMat::operator()(const int i, const int j) const{ #ifdef MATPTR return v[i][j]; #else - return v[i*mm + j]; + return v[i*(size_t)mm + j]; #endif } @@ -712,11 +712,11 @@ inline const T NRMat::get_ij(const int i, const int j) const{ #ifdef MATPTR return v[i][j]; #else - return v[i*mm + j]; + return v[i*(size_t)mm + j]; #endif #ifdef CUDALA }else{ - const int pozice = i*mm + j; + const size_t pozice = i*(size_t)mm + j; gpuget(1, sizeof(T), v + pozice, &ret); return ret; } @@ -743,8 +743,8 @@ inline int NRMat::ncols() const{ * @return number of elements ******************************************************************************/ template -inline int NRMat::size() const{ - return nn*mm; +inline size_t NRMat::size() const{ + return (size_t)nn*mm; } 
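A minimal standalone sketch (not from the library; the dimensions are chosen only for illustration) of the 32-bit overflow that the size() change above and the many (size_t)nn*mm casts throughout this diff guard against: multiplying two ints first and only then widening loses the high bits.

// Hypothetical, self-contained illustration of why the (size_t) cast is applied
// to one factor before the multiplication; 50000 is an arbitrary example size.
#include <cstdio>
#include <cstddef>

int main()
{
    int nn = 50000, mm = 50000;           // 2.5e9 elements: fits in size_t,
                                          // but exceeds the 32-bit int range
    size_t nm = (size_t)nn * mm;          // mm is converted to size_t, so the
                                          // product is computed in the wide type
    // size_t bad = nn * mm;              // would multiply two ints first and overflow
    std::printf("%zu elements\n", nm);    // prints 2500000000
    return 0;
}

The same pattern appears in the row addressing fixed throughout this diff, e.g. v + i*(size_t)mm, where the offset i*mm alone can already exceed the int range for large matrices.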
/***************************************************************************//** @@ -795,7 +795,7 @@ inline const double NRMat::amax() const{ #ifdef CUDALA }else{ double ret(0.0); - const int pozice = cublasIdamax(nn*mm, v, 1) - 1; + const size_t pozice = cublasIdamax((size_t)nn*mm, v, 1) - 1; TEST_CUBLAS("cublasIdamax"); gpuget(1, sizeof(double), v + pozice, &ret); return ret; @@ -815,7 +815,7 @@ inline const double NRMat::amin() const{ if(location == cpu){ #endif // idamin seems not to be supported - const int nm = nn*mm; + const size_t nm = (size_t)nn*mm; double val(0.0); int index(-1); ret = std::numeric_limits::max(); @@ -834,7 +834,7 @@ inline const double NRMat::amin() const{ #endif #ifdef CUDALA }else{ - const int pozice = cublasIdamin(nn*mm, v, 1) - 1; + const size_t pozice = cublasIdamin((size_t)nn*mm, v, 1) - 1; TEST_CUBLAS("cublasIdamin"); gpuget(1, sizeof(double), v + pozice, &ret); } @@ -860,7 +860,7 @@ inline const complex NRMat >::amax() const{ #ifdef CUDALA }else{ complex ret(0.0, 0.0); - const int pozice = cublasIzamax(nn*mm, (cuDoubleComplex*)v, 1) - 1; + const size_t pozice = cublasIzamax((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1; TEST_CUBLAS("cublasIzamax"); gpuget(1, sizeof(complex), v + pozice, &ret); return ret; @@ -881,7 +881,7 @@ inline const complex NRMat >::amin() const{ if(location == cpu){ #endif // idamin seems not to be supported - const int nm = nn*mm; + const size_t nm = (size_t)nn*mm; int index(-1); double val(0.0), min_val(0.0); complex z_val(0.0, 0.0); @@ -903,7 +903,7 @@ inline const complex NRMat >::amin() const{ #endif #ifdef CUDALA }else{ - const int pozice = cublasIzamin(nn*mm, (cuDoubleComplex*)v, 1) - 1; + const size_t pozice = cublasIzamin((size_t)nn*mm, (cuDoubleComplex*)v, 1) - 1; TEST_CUBLAS("cublasIzamin"); gpuget(1, sizeof(complex), v + pozice, &ret); } @@ -991,7 +991,7 @@ NRMat & NRMat::operator|=(const NRMat &rhs) { * @see NRMat::count, NRMat::operator|=() ******************************************************************************/ template -void NRMat::copyonwrite() { +void NRMat::copyonwrite(bool detachonly) { if(!count) laerror("attempt to call copyonwrite() for a matrix with count == 0"); if(*count > 1){ (*count)--; @@ -1002,20 +1002,20 @@ void NRMat::copyonwrite() { #endif #ifdef MATPTR T **newv = new T*[nn]; - newv[0] = new T[mm*nn]; - memcpy(newv[0], v[0], mm*nn*sizeof(T)); + newv[0] = new T[(size_t)mm*nn]; + if(!detachonly) memcpy(newv[0], v[0], (size_t)mm*nn*sizeof(T)); v = newv; for(register int i=1; i::resize(int n, int m) { #endif #ifdef MATPTR v = new T*[nn]; - v[0] = new T[m*n]; + v[0] = new T[(size_t)m*n]; for (register int i=1; i< n; i++) v[i] = v[i-1] + m; #else - v = new T[m*n]; + v = new T[(size_t)m*n]; #endif #ifdef CUDALA }else{ - v = (T *) gpualloc(n*m*sizeof(T)); + v = (T *) gpualloc((size_t)n*m*sizeof(T)); } #endif return; @@ -1108,15 +1108,15 @@ void NRMat::resize(int n, int m) { delete[] v; #ifdef MATPTR v = new T*[nn]; - v[0] = new T[m*n]; + v[0] = new T[(size_t)m*n]; for (int i=1; i< n; i++) v[i] = v[i-1] + m; #else - v = new T[m*n]; + v = new T[(size_t)m*n]; #endif #ifdef CUDALA }else{ gpufree(v); - v=(T *) gpualloc(n*m*sizeof(T)); + v=(T *) gpualloc((size_t)n*m*sizeof(T)); } #endif } @@ -1228,7 +1228,7 @@ public: #ifdef MATPTR return NRMat::v[i - 1][j - 1]; #else - return NRMat::v[(i-1)*NRMat::mm+j-1]; + return NRMat::v[(i-1)*(size_t)NRMat::mm+j-1]; #endif } @@ -1258,11 +1258,11 @@ public: #ifdef MATPTR return NRMat::v[i - 1][j - 1]; #else - return NRMat::v[(i-1)*NRMat::mm + (j-1)]; + return 
NRMat::v[(size_t)(i-1)*NRMat::mm + (j-1)]; #endif #ifdef CUDALA }else{ - const int pozice = (i-1)*NRMat::mm + (j-1); + const size_t pozice = (size_t)(i-1)*NRMat::mm + (j-1); gpuget(1, sizeof(T), NRMat::v + pozice, &ret); return ret; } @@ -1286,10 +1286,10 @@ NRMat& NRMat::operator^=(const NRMat &rhs){ copyonwrite();// ensure that *count == 1 #ifdef MATPTR - for (register int i=0; i< nn*mm; i++) v[0][i] *= rhs.v[0][i]; + for (register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= rhs.v[0][i]; #else - const int Dim = nn*mm; - for(register int i=0; i::moveto(const GPUID dest) { T *vold = v; if(dest == cpu){ //moving from GPU to CPU - v = new T[nn*mm]; - gpuget(nn*mm, sizeof(T), vold, v); + v = new T[(size_t)nn*mm]; + gpuget((size_t)nn*mm, sizeof(T), vold, v); if(*count == 1){ gpufree(vold); } else{ --(*count); count = new int(1); } }else{ //moving from CPU to GPU - v = (T *) gpualloc(nn*mm*sizeof(T)); - gpuput(nn*mm, sizeof(T), vold, v); + v = (T *) gpualloc((size_t)nn*mm*sizeof(T)); + gpuput((size_t)nn*mm, sizeof(T), vold, v); if(*count == 1) delete[] vold; else{ --(*count); count = new int(1);} } @@ -1351,3 +1351,4 @@ NRVECMAT_OPER2(Mat, -) }//end of the LA-namespace #endif/* _LA_MAT_H_ */ + diff --git a/noncblas.cc b/noncblas.cc index 6cb958d..923be5b 100644 --- a/noncblas.cc +++ b/noncblas.cc @@ -195,13 +195,14 @@ void cblas_dspmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, { if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(Uplo!=CblasLower) laerror("CblasLower uplo asserted"); + char U = BLAS_FORTRANCASE('u'); #ifdef FORINT const FINT ntmp=N; const FINT incxtmp=incX; const FINT incytmp=incY; - FORNAME(dspmv) ("U",&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp); + FORNAME(dspmv) (&U,&ntmp, &alpha, Ap, X, &incxtmp, &beta, Y, &incytmp); #else - FORNAME(dspmv) ("U",&N, &alpha, Ap, X, &incX, &beta, Y, &incY); + FORNAME(dspmv) (&U,&N, &alpha, Ap, X, &incX, &beta, Y, &incY); #endif } @@ -214,13 +215,14 @@ void cblas_zhpmv(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, { if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(Uplo!=CblasLower) laerror("CblasLower uplo asserted"); + char U = BLAS_FORTRANCASE('u'); #ifdef FORINT const FINT ntmp=N; const FINT incxtmp=incX; const FINT incytmp=incY; - FORNAME(zhpmv) ("U",&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp); + FORNAME(zhpmv) (&U,&ntmp, alpha, Ap, X, &incxtmp, beta, Y, &incytmp); #else - FORNAME(zhpmv) ("U",&N, alpha, Ap, X, &incX, beta, Y, &incY); + FORNAME(zhpmv) (&U,&N, alpha, Ap, X, &incX, beta, Y, &incY); #endif } @@ -298,6 +300,8 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA { if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); //swap a-b, m-n + char transb = BLAS_FORTRANCASE(TransB==CblasNoTrans?'N':'T'); + char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T'); #ifdef FORINT const FINT mtmp=M; const FINT ntmp=N; @@ -305,10 +309,10 @@ void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA const FINT ldatmp=lda; const FINT ldbtmp=ldb; const FINT ldctmp=ldc; - FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", + FORNAME(dgemm) (&transb,&transa, &ntmp, &mtmp, &ktmp, &alpha, B, &ldbtmp, A, &ldatmp, &beta, C, &ldctmp); #else - FORNAME(dgemm) (TransB==CblasNoTrans?"N":"T", TransA==CblasNoTrans?"N":"T", + FORNAME(dgemm) (&transb,&transa, &N, &M, &K, &alpha, B, &ldb, A, &lda, &beta, C, &ldc); #endif } @@ -322,6 +326,8 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, 
const enum CBLAS_TRANSPOSE TransA { if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); //swap a-b, m-n + char transb = BLAS_FORTRANCASE(TransB==CblasConjTrans?'C':(TransB==CblasNoTrans?'N':'T')); + char transa = BLAS_FORTRANCASE(TransA==CblasConjTrans?'C':(TransA==CblasNoTrans?'N':'T')); #ifdef FORINT const FINT mtmp=M; const FINT ntmp=N; @@ -329,12 +335,10 @@ void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA const FINT ldatmp=lda; const FINT ldbtmp=ldb; const FINT ldctmp=ldc; - FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), - TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), + FORNAME(zgemm) (&transb,&transa, &ntmp, &mtmp, &ktmp, alpha, B, &ldbtmp, A, &ldatmp, beta, C, &ldctmp); #else - FORNAME(zgemm) ( TransB==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), - TransA==CblasConjTrans?"C":(TransB==CblasNoTrans?"N":"T"), + FORNAME(zgemm) ( &transb,&transa, &N, &M, &K, alpha, B, &ldb, A, &lda, beta, C, &ldc); #endif } @@ -347,19 +351,21 @@ void cblas_dgemv(const enum CBLAS_ORDER Order, const double *X, const int incX, const double beta, double *Y, const int incY) { + char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'N':'T'); + char transax = BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N'); #ifdef FORINT const FINT mtmp=M; const FINT ntmp=N; const FINT ldatmp=lda; const FINT incxtmp=incX; const FINT incytmp=incY; - if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); + if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); //swap n-m and toggle transposition - else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); + else FORNAME(dgemv) (&transax, &ntmp, &mtmp, &alpha, A, &ldatmp, X, &incxtmp, &beta, Y, &incytmp ); #else - if(Order!=CblasRowMajor) FORNAME(dgemv) (TransA==CblasNoTrans?"N":"T", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); + if(Order!=CblasRowMajor) FORNAME(dgemv) (&transa, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); //swap n-m and toggle transposition - else FORNAME(dgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); + else FORNAME(dgemv) (&transax, &N, &M, &alpha, A, &lda, X, &incX, &beta, Y, &incY ); #endif } @@ -374,15 +380,16 @@ void cblas_zgemv(const enum CBLAS_ORDER Order, if(Order!=CblasRowMajor) laerror("CblasRowMajor order asserted"); if(TransA == CblasConjTrans) laerror("zgemv with CblasConjTrans not supportted"); //swap n-m and toggle transposition + char transa = BLAS_FORTRANCASE(TransA==CblasNoTrans?'T':'N'); #ifdef FORINT const FINT mtmp=M; const FINT ntmp=N; const FINT ldatmp=lda; const FINT incxtmp=incX; const FINT incytmp=incY; - FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp ); + FORNAME(zgemv) (&transa, &ntmp, &mtmp, alpha, A, &ldatmp, X, &incxtmp, beta, Y, &incytmp ); #else - FORNAME(zgemv) (TransA==CblasNoTrans?"T":"N", &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY ); + FORNAME(zgemv) (&transa, &N, &M, alpha, A, &lda, X, &incX, beta, Y, &incY ); #endif } diff --git a/nonclass.cc b/nonclass.cc index 1804cec..f22e7ed 100644 --- a/nonclass.cc +++ b/nonclass.cc @@ -17,6 +17,7 @@ */ //this can be safely included since it contains ifdefs NONCBLAS and NONCLAPACK anyway +#include "la_traits.h" #include "noncblas.h" #include "vec.h" #include 
"smat.h" @@ -196,7 +197,7 @@ static void linear_solve_do(NRSMat &a, double *b, const int nrhs, const FINT r, *ipiv; a.copyonwrite(); ipiv = new FINT[n]; - char U = 'U'; + char U = LAPACK_FORTRANCASE('u'); #ifdef FORINT const FINT ntmp=n; const FINT nrhstmp=nrhs; @@ -298,9 +299,9 @@ int linear_solve_x(NRMat &_A, double *_B, const int _rhsCount, const int const int A_rows = _A.nrows(); const int A_cols = _A.ncols(); - const char fact = _eq?'E':'N'; - const char trans = 'T';//because of c-order - char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const + const char fact = LAPACK_FORTRANCASE(_eq?'E':'N'); + const char trans = LAPACK_FORTRANCASE('T');//because of c-order + char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){ laerror("linear_solve_x: invalid input matrices"); @@ -371,9 +372,9 @@ int linear_solve_x(NRMat > &_A, complex *_B, const int _ const int A_rows = _A.nrows(); const int A_cols = _A.ncols(); - const char fact = _eq?'E':'N'; - const char trans = 'T';//because of c-order - char equed = 'B';//if fact=='N' then equed is an output argument, therefore not declared as const + const char fact = LAPACK_FORTRANCASE(_eq?'E':'N'); + const char trans = LAPACK_FORTRANCASE('T');//because of c-order + char equed = LAPACK_FORTRANCASE('B');//if fact=='N' then equed is an output argument, therefore not declared as const if(_eqCount < 0 || _eqCount > A_rows || _eqCount > A_cols || _rhsCount < 0){ laerror("linear_solve_x: invalid input matrices"); @@ -557,9 +558,9 @@ void diagonalize(NRMat &a, NRVec &w, const bool eivec, if(b) b->copyonwrite(); FINT r = 0; - char U ='U'; - char vectors = 'V'; - if (!eivec) vectors = 'N'; + char U =LAPACK_FORTRANCASE('u'); + char vectors = LAPACK_FORTRANCASE('v'); + if (!eivec) vectors = LAPACK_FORTRANCASE('n'); FINT LWORK = -1; double WORKX; FINT ldb=0; if(b) ldb=b->ncols(); @@ -588,7 +589,7 @@ void diagonalize(NRMat &a, NRVec &w, const bool eivec, #endif delete[] WORK; - if (vectors == 'V' && corder) a.transposeme(n); + if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) a.transposeme(n); if (r < 0) laerror("illegal argument in sygv/syev in diagonalize()"); if (r > 0) laerror("convergence problem in sygv/syev in diagonalize()"); @@ -620,12 +621,13 @@ void diagonalize(NRMat > &a, NRVec &w, const bool eivec, if(b) b->copyonwrite(); FINT r = 0; - char U ='U'; - char vectors = 'V'; - if (!eivec) vectors = 'N'; + char U =LAPACK_FORTRANCASE('U'); + char vectors = LAPACK_FORTRANCASE('V'); + if (!eivec) vectors = LAPACK_FORTRANCASE('n'); FINT LWORK = -1; complex WORKX; FINT ldb=0; if(b) ldb=b->ncols(); +std::cout << "test vectors "< > &a, NRVec &w, const bool eivec, delete[] WORK; delete[] RWORK; - if (vectors == 'V' && corder) a.transposeme(n); + if (LAPACK_FORTRANCASE(vectors) == LAPACK_FORTRANCASE('v') && corder) {a.transposeme(n); a.conjugateme();} if (r < 0) laerror("illegal argument in hegv/heev in diagonalize()"); if (r > 0) laerror("convergence problem in hegv/heev in diagonalize()"); @@ -684,8 +686,8 @@ void diagonalize(NRSMat &a, NRVec &w, NRMat *v, if(b) b->copyonwrite(); FINT r = 0; - char U = 'U'; - char job = v ? 'v' : 'n'; + char U = LAPACK_FORTRANCASE('u'); + char job = LAPACK_FORTRANCASE(v ? 
'v' : 'n'); double *WORK = new double[3*n]; FINT ldv=v?v->ncols():n; @@ -730,8 +732,8 @@ void diagonalize(NRSMat > &a, NRVec &w, NRMatcopyonwrite(); FINT r = 0; - char U = 'U'; - char job = v ? 'v' : 'n'; + char U = LAPACK_FORTRANCASE('u'); + char job = LAPACK_FORTRANCASE(v ? 'v' : 'n'); complex *WORK = new complex[2*n]; double *RWORK = new double[3*n]; @@ -890,8 +892,8 @@ void gdiagonalize(NRMat &a, NRVec &wr, NRVec &wi, const int sorttype, const int biorthonormalize, NRMat *b, NRVec *beta) { - if(n<=0) n = a.nrows(); - if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix"); + if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");} + if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix"); if (n > wr.size()) laerror("inconsistent dimension of eigen vector in gdiagonalize()"); if (vl) if (n > vl->nrows() || n > vl->ncols()) @@ -911,8 +913,8 @@ void gdiagonalize(NRMat &a, NRVec &wr, NRVec &wi, if (beta) beta->copyonwrite(); if (b) b->copyonwrite(); - char jobvl = vl ? 'V' : 'N'; - char jobvr = vr ? 'V' : 'N'; + char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n'); + char jobvr = LAPACK_FORTRANCASE(vr ? 'v' : 'n'); double work0; FINT lwork = -1; FINT r; @@ -1055,8 +1057,8 @@ void gdiagonalize(NRMat > &a, NRVec< complex > &w, NRMat > *b, NRVec > *beta) { - if(n<=0) n = a.nrows(); - if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix"); + if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");} + if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() of too big submatrix"); if (n > w.size()) laerror("inconsistent dimension of eigen vector in gdiagonalize()"); if (vl) if (n > vl->nrows() || n > vl->ncols()) @@ -1075,8 +1077,8 @@ void gdiagonalize(NRMat > &a, NRVec< complex > &w, if (beta) beta->copyonwrite(); if (b) b->copyonwrite(); - char jobvl = vl ? 'V' : 'N'; - char jobvr = vr ? 'V' : 'N'; + char jobvl = LAPACK_FORTRANCASE(vl ? 'v' : 'n'); + char jobvr = LAPACK_FORTRANCASE(vr ? 
'v' : 'n'); complex work0; FINT lwork = -1; FINT r; @@ -1146,8 +1148,8 @@ void gdiagonalize(NRMat > &a, NRVec< complex > &w, if (corder) { - if (vl) vl->transposeme(n); - if (vr) vr->transposeme(n); + if (vl) {vl->transposeme(n); vl->conjugateme();} + if (vr) {vr->transposeme(n); vr->conjugateme();} } } @@ -1159,8 +1161,8 @@ void gdiagonalize(NRMat &a, NRVec< complex > &w, const bool corder, int n, const int sorttype, const int biorthonormalize, NRMat *b, NRVec *beta) { - if(n<=0) n = a.nrows(); - if(n> a.nrows() || n == a.nrows() && n != a.ncols()) laerror("gdiagonalize() call for a non-square matrix"); + if(n<=0) {n = a.nrows(); if(a.ncols()!=a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");} + if(n> a.nrows() || n == a.nrows() && n != a.ncols()) laerror("gdiagonalize() of too big submatrix"); NRVec wr(n), wi(n); NRMat *rvl = 0; @@ -1226,10 +1228,12 @@ void gdiagonalize(NRMat &a, NRVec< complex > &w, template<> const NRMat realpart > >(const NRMat< complex > &a) { + NRMat result(a.nrows(), a.ncols()); + #ifdef CUDALA - if(location == cpu){ + if(a.location == cpu){ #endif - NRMat result(a.nrows(), a.ncols()); +// NRMat result(a.nrows(), a.ncols()); cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0], 2, result, 1); #ifdef CUDALA }else{ @@ -1242,11 +1246,13 @@ const NRMat realpart > >(const NRMat< complex const NRMat imagpart > >(const NRMat< complex > &a) { + NRMat result(a.nrows(), a.ncols()); + #ifdef CUDALA - if(location == cpu){ + if(a.location == cpu){ #endif - NRMat result(a.nrows(), a.ncols()); +// NRMat result(a.nrows(), a.ncols()); cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0]+1, 2, result, 1); #ifdef CUDALA }else{ @@ -1259,12 +1265,15 @@ const NRMat imagpart > >(const NRMat< complex const NRMat< complex > realmatrix > (const NRMat &a) { + + NRMat > result(a.nrows(), a.ncols()); + #ifdef CUDALA - if(location == cpu){ + if(a.location == cpu){ #endif - NRMat > result(a.nrows(), a.ncols()); +// NRMat > result(a.nrows(), a.ncols()); cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0], 2); #ifdef CUDALA }else{ @@ -1278,11 +1287,13 @@ const NRMat< complex > realmatrix > (const NRMat & template<> const NRMat< complex > imagmatrix > (const NRMat &a) { + NRMat< complex > result(a.nrows(), a.ncols()); + #ifdef CUDALA - if(location == cpu){ + if(a.location == cpu){ #endif - NRMat< complex > result(a.nrows(), a.ncols()); +// NRMat< complex > result(a.nrows(), a.ncols()); cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0]+1, 2); #ifdef CUDALA }else{ @@ -1577,7 +1588,7 @@ void cholesky(NRMat &a, bool upper) if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky"); FINT lda=a.ncols(); FINT n=a.nrows(); -char uplo=upper?'U':'L'; +char uplo= LAPACK_FORTRANCASE(upper?'u':'l'); FINT info; a.copyonwrite(); FORNAME(dpotrf)(&uplo, &n, a, &lda, &info); @@ -1596,7 +1607,7 @@ void cholesky(NRMat > &a, bool upper) if(a.nrows()!=a.ncols()) laerror("matrix must be square in Cholesky"); FINT lda=a.ncols(); FINT n=a.nrows(); -char uplo=upper?'U':'L'; +char uplo= LAPACK_FORTRANCASE(upper?'u':'l'); FINT info; a.copyonwrite(); a.transposeme();//switch to Fortran order @@ -1788,3 +1799,4 @@ if(nchange&1)//still adjust to get determinant=1 } }//namespace + diff --git a/smat.cc b/smat.cc index e8b62f4..2b57aec 100644 --- a/smat.cc +++ b/smat.cc @@ -28,11 +28,8 @@ #include #include #include +#include -extern "C" { - extern ssize_t read(int, void *, size_t); - extern ssize_t write(int, const void *, size_t); -} namespace LA { @@ -130,7 +127,7 @@ NRSMat & 
diff --git a/smat.cc b/smat.cc
index e8b62f4..2b57aec 100644
--- a/smat.cc
+++ b/smat.cc
@@ -28,11 +28,8 @@
 #include
 #include
 #include
+#include <unistd.h>
-extern "C" {
- extern ssize_t read(int, void *, size_t);
- extern ssize_t write(int, const void *, size_t);
-}
 namespace LA {
@@ -130,7 +127,7 @@ NRSMat<T> & NRSMat<T>::operator=(const T &a) {
 NOT_GPU(*this);
 copyonwrite();
 memset(v, 0, NN2*sizeof(T));
- for(register int i=0; i::diagonalof(NRVec &r, const bool divide, bool cache) const
 if(divide){
 for(register int i=0; i NRSMat::operator-() const {
 NOT_GPU(*this);
 NRSMat<T> result(nn, getlocation());
- for(register int i = 0; i::trace() const {
 NOT_GPU(*this);
 T tmp = 0;
- for(register int i=0; i void NRSMat::randomize(const double &x) {
 NOT_GPU(*this);
- for(int i=0; i::randomize(const double &x) {
 ******************************************************************************/
 template<>
 void NRSMat<complex<double> >::randomize(const double &x) {
- for(register int i=0; i &r, const char trans, const T alpha, const NRVec &x) const {r.gemv(beta,*this,trans,alpha,x);};
 void gemv(const T beta, NRVec<complex<T> > &r, const char trans, const T alpha, const NRVec<complex<T> > &x) const {r.gemv(beta,*this,trans,alpha,x);};
- inline const T& operator[](const int ij) const;
- inline T& operator[](const int ij);
+ inline const T& operator[](const size_t ij) const;
+ inline T& operator[](const size_t ij);
 inline const T& operator()(const int i, const int j) const;
 inline T& operator()(const int i, const int j);
 inline int nrows() const;
 inline int ncols() const;
- inline int size() const;
+ inline size_t size() const;
 inline bool transp(const int i, const int j) const {return i>j;} //this can be used for compact storage of matrices, which are actually not symmetric, but one triangle of them is redundant
 const typename LA_traits<T>::normtype norm(const T scalar = (T)0) const;
@@ -155,9 +155,9 @@ public:
 void get(int fd, bool dimensions = 1, bool transp = 0);
 void put(int fd, bool dimensions = 1, bool transp = 0) const;
- void copyonwrite();
+ void copyonwrite(bool detachonly=false);
- void clear() {copyonwrite(); LA_traits<T>::clear(v,NN2);}; //zero out
+ void clear() {copyonwrite(true); LA_traits<T>::clear(v,NN2);}; //zero out
 void resize(const int n);
 void dealloc(void) {resize(0);}
@@ -212,7 +212,7 @@ inline NRSMat<T>::NRSMat(const T& a, const int n) : nn(n), count(new int(1)) {
 if(location == cpu){
 #endif
 v = new T[NN2];
- if(a != (T)0) for(register int i = 0; i & NRSMat::operator*=(const T &a) {
 NOT_GPU(*this);
 copyonwrite();
- for(register int i = 0; i & NRSMat::operator+=(const T &a) {
 NOT_GPU(*this);
 copyonwrite();
- for(register int i = 0; i & NRSMat::operator-=(const T &a) {
 NOT_GPU(*this);
 copyonwrite();
- for(register int i = 0; i& NRSMat::operator+=(const NRSMat& rhs) {
 SAME_LOC(*this, rhs);
 copyonwrite();
- for(register int i = 0; i& NRSMat::operator-=(const NRSMat& rhs) {
 NOT_GPU(*this);
 copyonwrite();
- for(register int i = 0; i NRSMat::operator-(const NRMat &rhs) const {
 * @return reference to the corresponding matrix element
 ******************************************************************************/
 template
-inline T& NRSMat<T>::operator[](const int ij) {
+inline T& NRSMat<T>::operator[](const size_t ij) {
 #ifdef DEBUG
 if(_LA_count_check && *count != 1) laerror("T& NRSMat::operator[] used for matrix with count>1");
 if(ij<0 || ij>=NN2) laerror("T& NRSMat::operator[] out of range");
@@ -560,7 +560,7 @@ inline T& NRSMat<T>::operator[](const int ij) {
 * @return constant reference to the corresponding matrix element
 ******************************************************************************/
 template
-inline const T & NRSMat<T>::operator[](const int ij) const {
+inline const T & NRSMat<T>::operator[](const size_t ij) const {
 #ifdef DEBUG
 if(ij<0 || ij>=NN2) laerror("T& NRSMat::operator[] out of range");
 if(!v) laerror("T& NRSMat::operator[] used for unallocated NRSmat object");
@@ -578,8 +578,8 @@ inline const T & NRSMat<T>::operator[](const int ij) const {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index(T i, T j) {
- return (i>=j) ? i*(i+1)/2+j : j*(j+1)/2+i;
+inline size_t SMat_index(T i, T j) {
+ return (i>=j) ? i*(size_t)(i+1)/2+j : j*(size_t)(j+1)/2+i;
 }
 /***************************************************************************//**
@@ -590,8 +590,8 @@ inline T SMat_index(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index_igej(T i, T j) {
- return i*(i+1)/2+j;
+inline size_t SMat_index_igej(T i, T j) {
+ return i*(size_t)(i+1)/2+j;
 }
 /***************************************************************************//**
@@ -602,8 +602,8 @@ inline T SMat_index_igej(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index_ilej(T i, T j) {
- return j*(j+1)/2+i;
+inline size_t SMat_index_ilej(T i, T j) {
+ return j*(size_t)(j+1)/2+i;
 }
 /***************************************************************************//**
@@ -614,8 +614,8 @@ inline T SMat_index_ilej(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index_1(T i, T j) {
- return (i>=j)? i*(i-1)/2+j-1 : j*(j-1)/2+i-1;
+inline size_t SMat_index_1(T i, T j) {
+ return (i>=j)? i*(size_t)(i-1)/2+j-1 : j*(size_t)(j-1)/2+i-1;
 }
 /***************************************************************************//**
@@ -626,8 +626,8 @@ inline T SMat_index_1(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index_1igej(T i, T j) {
- return i*(i-1)/2+j-1;
+inline size_t SMat_index_1igej(T i, T j) {
+ return i*(size_t)(i-1)/2+j-1;
 }
 /***************************************************************************//**
@@ -638,21 +638,21 @@ inline T SMat_index_1igej(T i, T j) {
 * @return cumulative index
 ******************************************************************************/
 template
-inline T SMat_index_1ilej(T i, T j) {
- return j*(j-1)/2+i-1;
+inline size_t SMat_index_1ilej(T i, T j) {
+ return j*(size_t)(j-1)/2+i-1;
 }
 //indexing for antisymmetric matrix (used by fourindex)
 template
-inline T ASMat_index(T i, T j)
+inline size_t ASMat_index(T i, T j)
 {
 if(i == j) return -1;
-return (i>j) ? i*(i-1)/2+j : j*(j-1)/2+i;
+return (i>j) ? i*(size_t)(i-1)/2+j : j*(size_t)(j-1)/2+i;
 }
 template
-inline T ASMat_index_1(T i, T j)
+inline size_t ASMat_index_1(T i, T j)
 {
 if(i == j) return -1;
 return (i>j)? (i-2)*(i-1)/2+j-1 : (j-2)*(j-1)/2+i-1;
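The SMat_index family above maps an index pair (i,j) to the linear offset of the packed triangle in which NRSMat stores its elements; the change widens the intermediate product to size_t so the offset no longer overflows a 32-bit int for large dimensions. A standalone illustration of the 0-based formula and of why the cast matters (the helper below is not the library's, just the same arithmetic, and it assumes a 64-bit size_t):

#include <cstddef>
#include <cassert>

// Element (i,j) of a symmetric matrix, 0-based, is stored in the packed
// lower triangle at offset i*(i+1)/2 + j for i>=j. Promoting one factor to
// size_t before the multiplication keeps the product from overflowing int.
static inline size_t smat_index(int i, int j)
{
	return (i >= j) ? (size_t)i*(i+1)/2 + j : (size_t)j*(j+1)/2 + i;
}

int main()
{
	const int n = 70000;                       // n*(n+1)/2 exceeds the 32-bit int range
	assert(smat_index(0, 0) == 0);
	assert(smat_index(2, 1) == 4);             // row 2 starts at offset 3, plus column 1
	assert(smat_index(n-1, n-1) == (size_t)n*(n+1)/2 - 1);  // last stored element
	return 0;
}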
@@ -715,7 +715,7 @@ inline int NRSMat<T>::ncols() const {
 * @return number of elements of this symmetric matrix of generalt type T
 ******************************************************************************/
 template
-inline int NRSMat<T>::size() const {
+inline size_t NRSMat<T>::size() const {
 return NN2;
 }
@@ -758,7 +758,7 @@ inline const double NRSMat<double>::amin() const {
 double val(0.0);
 int index(-1);
 ret = std::numeric_limits<double>::max();
- for(register int i = 0; i < NN2; i++){
+ for(register size_t i = 0; i < NN2; i++){
 val = std::abs(v[i]);
 if(val < ret){ index = i; ret = val; }
 }
@@ -812,7 +812,7 @@ inline const complex<double> NRSMat<complex<double> >::amin() const{
 complex<double> z_val(0.0, 0.0);
 min_val = std::numeric_limits<double>::max();
- for(register int i = 0; i < NN2; i++){
+ for(register size_t i = 0; i < NN2; i++){
 z_val = v[i];
 val = std::abs(z_val.real()) + std::abs(z_val.imag());
 if(val < min_val){ index = i; min_val = val; }
@@ -920,7 +920,7 @@ NRSMat<T> & NRSMat<T>::operator=(const NRSMat<T> & rhs) {
 * @see NRSMat::operator|=, NRSMat::copyonwrite()
 ******************************************************************************/
 template
-void NRSMat<T>::copyonwrite() {
+void NRSMat<T>::copyonwrite(bool detachonly) {
 if(!count) laerror("calling NRSMat::copyonwrite() for undefined NRSMat object");
 if(*count > 1){
 (*count)--;
@@ -931,12 +931,12 @@ void NRSMat<T>::copyonwrite() {
 if(location == cpu) {
 #endif
 newv = new T[NN2];
- memcpy(newv, v, NN2*sizeof(T));
+ if(!detachonly) memcpy(newv, v, NN2*sizeof(T));
 #ifdef CUDALA
 }else{
 newv = (T *) gpualloc(NN2*sizeof(T));
 if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRSMat::copyonwrite()");
- cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
+ if(!detachonly) cublasScopy(NN2*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
 TEST_CUBLAS("cublasScopy");//"NRSMat::copyonwrite()"
 }
 #endif
diff --git a/sparsemat.h b/sparsemat.h
index b0b7a84..b71f5cf 100644
--- a/sparsemat.h
+++ b/sparsemat.h
@@ -132,8 +132,8 @@ public:
 void setunsymmetric();//unwind the matrix assuming it was indeed symmetric
 inline bool issymmetric() const {return symmetric;}
 unsigned int length() const;
- void copyonwrite();
- void clear() {copyonwrite();unsort();deletelist();}
+ void copyonwrite(bool detachonly=false);
+ void clear() {copyonwrite(true); if(count) {delete count; count=NULL;}}
 void simplify();
 const T trace() const;
 const typename LA_traits<T>::normtype norm(const T scalar=(T)0) const; //is const only mathematically, not in internal implementation - we have to simplify first
diff --git a/t.cc b/t.cc
index d47aef9..33ca48a 100644
--- a/t.cc
+++ b/t.cc
@@ -1799,6 +1799,11 @@ for(int i=0; i
-extern "C" {
- extern ssize_t read(int, void *, size_t);
- extern ssize_t write(int, const void *, size_t);
-}
 namespace LA {
@@ -541,7 +538,7 @@ template<>
 void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
 const char trans, const double alpha, const NRVec<double> &x) {
 #ifdef DEBUG
- if((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+ if((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
 #endif
 SAME_LOC3(*this, x, A);
 copyonwrite();
@@ -549,10 +546,10 @@ void NRVec<double>::gemv(const double beta, const NRMat<double> &A,
 #ifdef CUDALA
 if(location==cpu){
 #endif
- cblas_dgemv(CblasRowMajor, (trans=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+ cblas_dgemv(CblasRowMajor, (tolower(trans)=='n' ? CblasNoTrans:CblasTrans), A.nrows(), A.ncols(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
 #ifdef CUDALA
 }else{
- cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
+ cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), x.v, 1, beta, v, 1);
 TEST_CUBLAS("cublasDgemv");
 }
 #endif
@@ -572,7 +569,7 @@ template<>
 void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
 const char trans, const double alpha, const NRVec<complex<double> > &x) {
 #ifdef DEBUG
- if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+ if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
 #endif
 SAME_LOC3(*this, x, A);
 copyonwrite();
@@ -580,16 +577,16 @@ void NRVec<complex<double> >::gemv(const double beta, const NRMat<double> &A,
 #ifdef CUDALA
 if(location==cpu){
 #endif
- cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+ cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
 A.nrows(), A.ncols(), alpha, A, A.ncols(), (double *)x.v, 2, beta, (double *)v, 2);
- cblas_dgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+ cblas_dgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
 A.nrows(), A.ncols(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
 #ifdef CUDALA
 }else{
- cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
+ cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), (double*)(x.v), 2, beta, (double *)v, 2);
 TEST_CUBLAS("cublasDgemv");
- cublasDgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
+ cublasDgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(), alpha, A, A.ncols(), ((double *)x.v) + 1, 2, beta, ((double *)v)+1, 2);
 TEST_CUBLAS("cublasDgemv");
 }
 #endif
@@ -611,14 +608,14 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
 const NRMat<complex<double> > &A, const char trans, const complex<double> alpha, const NRVec<complex<double> > &x) {
 #ifdef DEBUG
- if ((trans == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
+ if ((tolower(trans) == 'n'?A.ncols():A.nrows()) != x.size()){ laerror("incompatible vectors"); }
 #endif
 SAME_LOC3(*this, x, A);
 copyonwrite();
 #ifdef CUDALA
 if(location == cpu){
 #endif
- cblas_zgemv(CblasRowMajor, (trans=='n'?CblasNoTrans:CblasTrans),
+ cblas_zgemv(CblasRowMajor, (tolower(trans)=='n'?CblasNoTrans:CblasTrans),
 A.nrows(), A.ncols(), &alpha, A, A.ncols(), x.v, 1, &beta, v, 1);
 #ifdef CUDALA
 }else{
@@ -626,7 +623,7 @@ void NRVec<complex<double> >::gemv(const complex<double> beta,
 const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag());
 const cuDoubleComplex _beta = make_cuDoubleComplex(beta.real(), beta.imag());
- cublasZgemv((trans=='n'?'T':'N'), A.ncols(), A.nrows(),
+ cublasZgemv((tolower(trans)=='n'?'T':'N'), A.ncols(), A.nrows(),
 _alpha, (cuDoubleComplex*)(A[0]), A.ncols(), (cuDoubleComplex*)(x.v), 1, _beta, (cuDoubleComplex*)v, 1);
 TEST_CUBLAS("cublasZgemv");
 }
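With tolower() applied to trans, the gemv() variants above accept the transposition flag in either case ('n'/'N', 't'/'T') before dispatching to cblas or cublas. A minimal sketch of that normalize-then-branch pattern, separate from the library code:

#include <cctype>
#include <iostream>

// Normalize the user-supplied trans character once, then branch on it,
// so 'n'/'N' and 't'/'T' behave identically.
static const char *gemv_kind(char trans)
{
	return std::tolower(static_cast<unsigned char>(trans)) == 'n' ? "no transpose" : "transpose";
}

int main()
{
	std::cout << gemv_kind('N') << '\n';	// same result as 'n'
	std::cout << gemv_kind('t') << '\n';	// same result as 'T'
	return 0;
}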
diff --git a/vec.h b/vec.h
index b40ae79..6aa5546 100644
--- a/vec.h
+++ b/vec.h
@@ -149,10 +149,10 @@ public:
 #endif
 //! create separate copy of the data corresponding to this vector
- void copyonwrite();
+ void copyonwrite(bool detachonly=false);
 //! purge this vector
- void clear() { copyonwrite(); LA_traits<T>::clear(v, nn); };
+ void clear() { copyonwrite(true); LA_traits<T>::clear(v, nn); };
 //! assignment operator assigns given vector
 NRVec& operator=(const NRVec &rhs);
@@ -806,7 +806,7 @@ NRVec<T>::~NRVec() {
 * make own copy of the underlying data connected with this vector
 ******************************************************************************/
 template
-void NRVec<T>::copyonwrite() {
+void NRVec<T>::copyonwrite(bool detachonly) {
 if(!count) laerror("copyonwrite of an undefined vector");
 if(*count > 1) {
 (*count)--;
@@ -817,12 +817,12 @@ void NRVec<T>::copyonwrite() {
 if(location == cpu){
 #endif
 newv = new T[nn];
- memcpy(newv, v, nn*sizeof(T));
+ if(!detachonly) memcpy(newv, v, nn*sizeof(T));
 #ifdef CUDALA
 }else{
 newv = (T *) gpualloc(nn*sizeof(T));
 if(sizeof(T)%sizeof(float) != 0) laerror("memory alignment problem in NRVec::copyonwrite()");
- cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
+ if(!detachonly) cublasScopy(nn*sizeof(T)/sizeof(float), (const float *) v, 1, (float *)newv, 1);
 TEST_CUBLAS("cublasScopy");//"NRVec::copyonwrite()"
 }
 #endif
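The detachonly parameter added to copyonwrite() throughout NRVec, NRSMat and SparseMat exists for the sake of clear(): when a shared buffer is about to be overwritten anyway, the object only needs to detach from the shared storage, so the memcpy/cublasScopy of the old contents is skipped. A simplified, self-contained model of that reference-counted detach, using a plain double buffer instead of the real templated classes:

#include <cstring>
#include <cassert>

// Simplified model of the detachonly optimization: when the buffer is shared,
// detach by allocating fresh storage; copy the old contents only if the caller
// still needs them. clear() passes detachonly=true because it zeroes the data
// immediately afterwards, so the copy would be wasted work.
struct SharedBuf {
	double *v;
	int *count;
	size_t n;

	explicit SharedBuf(size_t nn) : v(new double[nn]), count(new int(1)), n(nn) {}
	SharedBuf(const SharedBuf &o) : v(o.v), count(o.count), n(o.n) { ++*count; }
	~SharedBuf() { if(--*count == 0) { delete[] v; delete count; } }

	void copyonwrite(bool detachonly = false) {
		if(*count > 1) {
			--*count;
			double *newv = new double[n];
			if(!detachonly) std::memcpy(newv, v, n*sizeof(double));
			v = newv;
			count = new int(1);
		}
	}
	void clear() { copyonwrite(true); std::memset(v, 0, n*sizeof(double)); }
};

int main()
{
	SharedBuf a(4); a.v[0] = 1.0;
	SharedBuf b(a);          // shares storage with a
	b.clear();               // detaches without copying, then zeroes its own data
	assert(a.v[0] == 1.0 && b.v[0] == 0.0);
	return 0;
}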