*** empty log message ***

2011-01-18 14:37:05 +00:00
parent 600b5b3abd
commit 4534c2e56a
21 changed files with 753 additions and 138 deletions
--- a/csrmat.cc
+++ b/csrmat.cc
@@ -30,6 +30,9 @@ namespace LA {
 /*
 Commented out by Roman for ICC
 #define INSTANTIZE(T) \
 template void CSRMat<T>::gemm(const T beta, const CSRMat &a, const char transa, const CSRMat &b, const char transb, const T alpha); \
 template CSRMat<T> & CSRMat<T>::operator*=(const T &a); \
@@ -45,8 +48,8 @@ template void CSRMat<T>::put(int fd, bool dimen, bool transp) const; \
 INSTANTIZE(double)
 INSTANTIZE(complex<double>) 
 */
 //// forced instantiation of functions in the header in the corresponding object file
 template class CSRMat<double>;
--- a/csrmat.h
+++ b/csrmat.h
@@ -71,6 +71,7 @@ public:
 	CSRMat & operator=(const CSRMat &rhs);
 	void copyonwrite();
        void resize(const SPMatindex nn, const SPMatindex mm);
 	void dealloc(void) {resize(0,0);}
 	void moveto(GPUID destination);
        void clear();
 	~CSRMat();
@@ -128,5 +129,12 @@ public:
 	*/
 };
 template <typename T>
 std::ostream & operator<<(std::ostream &s, const CSRMat<T> &x);
 template <class T>
 std::istream& operator>>(std::istream  &s, CSRMat<T> &x);
 }//namespace
 #endif //_CSRMAT_H_
--- a/davidson.h
+++ b/davidson.h
@@ -33,26 +33,21 @@ namespace LA {
 //therefore the whole implementation must be a template in a header
 //Note that for efficiency in a direct CI case the diagonalof() should cache its result
 template <typename T, typename Matrix>
 extern void davidson(const Matrix &bigmat, NRVec<T> &eivals, NRVec<T> *eivecs, const char *eivecsfile, 
 		int nroots=1,  const bool verbose=0, const double eps=1e-6,
 	 	const bool incore=1, int maxit=100, const int maxkrylov = 500,
 		void (*initguess)(NRVec<T> &)=NULL);
 //@@@options: left eigenvectors by matrix transpose, overridesymmetric (for nrmat)
 //@@@small matrix gdiagonalize - shift complex roots up (option to gdiagonalize?)
 //@@@test gdiagonalize whether it sorts the roots and what for complex ones
 //@@@implement left eigenvectors for nonsymmetric case
 //Davidson algorithm: J. Comp. Phys. 17:817 (1975) 
 //@@@implement left eigenvectors for nonsymmetric case
 template <typename T, typename Matrix>
-void davidson(const Matrix &bigmat, NRVec<T> &eivals, NRVec<T> *eivecs, const char *eivecsfile,
+extern void davidson(const Matrix &bigmat, NRVec<T> &eivals, NRVec<T> *eivecs, const char *eivecsfile, 
-                int nroots,  const bool verbose, const double eps, 
+		int nroots=1,  const bool verbose=0, const double eps=1e-6,
-                const bool incore, int maxit, const int maxkrylov,
+	 	const bool incore=1, int maxit=100, const int maxkrylov = 500,
-		void (*initguess)(NRVec<T> &))
+		void (*initguess)(NRVec<T> &)=NULL)
 {
 bool flag=0;
 int n=bigmat.nrows();
@@ -87,7 +82,7 @@ smallH=0;
 //default guess based on lowest diagonal element of the matrix
-if(initguess) (*initguess)(vec1);
+if(initguess) initguess(vec1);
 else
 	{
 	const T *diagonal = bigmat.diagonalof(vec2,false,true);
--- a/fourindex.h
+++ b/fourindex.h
@@ -226,6 +226,7 @@ public:
 	inline matel4<I,T> *getlist() const {return list;}
 	inline I size() const {return nn;}
 	void resize(const I n);
 	void dealloc(void) {resize(0);}
 	void copyonwrite();
 	unsigned long length() const;
 	inline void add(const I i, const I j, const I k, const I l, const T elem) 
--- a/la.h
+++ b/la.h
@@ -42,6 +42,7 @@
 #include "smat.h"
 #include "sparsemat.h"
 #include "sparsesmat.h"
 #include "csrmat.h"
 #include "vec.h"
 using namespace LA;
--- a/la_traits.h
+++ b/la_traits.h
@@ -220,8 +220,30 @@ static inline normtype norm (const  complex<C> &x) {return std::abs(x);}
 static inline void axpy (complex<C> &s, const complex<C> &x, const complex<C> &c) {s+=x*c;}
 static inline void get(int fd, complex<C> &x, bool dimensions=0, bool transp=0) {if(sizeof(complex<C>)!=read(fd,&x,sizeof(complex<C>))) laerror("read error");}
 static inline void put(int fd, const complex<C> &x, bool dimensions=0, bool transp=0) {if(sizeof(complex<C>)!=write(fd,&x,sizeof(complex<C>))) laerror("write error");}
-static void multiget(size_t n,int fd, complex<C> *x, bool dimensions=0){ssize_t r=read(fd,x,n*sizeof(complex<C>)); if((ssize_t)(n*sizeof(complex<C>))!=r) {std::cout<<"read returned "<<r<<std::endl; laerror("read error");}}
+static void multiget(size_t n,int fd, complex<C> *x, bool dimensions=0)
-static void multiput(size_t n, int fd, const complex<C> *x, bool dimensions=0) {ssize_t r=write(fd,x,n*sizeof(complex<C>)); if((ssize_t)(n*sizeof(complex<C>))!=r) {std::cout<<"write returned "<<r<<std::endl; laerror("write error");}}
+	{
 	size_t total=0;
 	ssize_t r;
 	do{
 		r=read(fd,x+total,(n-total)*sizeof(complex<C>)); 
 		if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "<<r<<std::endl; laerror("read error");}
 		else total += r/sizeof(complex<C>);
 		if(r%sizeof(complex<C>)) laerror("read error 2");
 	  }
 	while(total < n);
 	}
 static void multiput(size_t n, int fd, const complex<C> *x, bool dimensions=0) 
 	{
 	size_t total=0;
 	ssize_t r;
 	do{
 		r=write(fd,x+total,(n-total)*sizeof(complex<C>)); 
 		if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "<<r<<std::endl; laerror("write error");}
 		else total += r/sizeof(complex<C>);
 		if(r%sizeof(complex<C>)) laerror("write error 2");
 	  }
 	while(total < n);
 	}
 static void copy(complex<C> *dest, complex<C> *src, unsigned int n) {memcpy(dest,src,n*sizeof(complex<C>));}
 static void clear(complex<C> *dest, unsigned int n) {memset(dest,0,n*sizeof(complex<C>));}
 static void copyonwrite(complex<C> &x) {};
@@ -232,6 +254,7 @@ static inline C realpart(const complex<C> &x) {return x.real();}
 static inline C imagpart(const complex<C> &x) {return x.imag();}
 };
 //non-complex scalars
 template<typename C>
 struct LA_traits_aux<C, scalar_true> {
@@ -248,8 +271,30 @@ static inline normtype norm (const  C &x) {return std::abs(x);}
 static inline void axpy (C &s, const C &x, const C &c) {s+=x*c;}
 static inline void put(int fd, const C &x, bool dimensions=0, bool transp=0) {if(sizeof(C)!=write(fd,&x,sizeof(C))) laerror("write error");}
 static inline void get(int fd, C &x, bool dimensions=0, bool transp=0) {if(sizeof(C)!=read(fd,&x,sizeof(C))) laerror("read error");}
-static void multiget(size_t n,int fd, C *x, bool dimensions=0){ssize_t r=read(fd,x,n*sizeof(C)); if((ssize_t)(n*sizeof(C))!=r) {std::cout<<"read returned "<<r<<std::endl; laerror("read error");}}
+static void multiget(size_t n,int fd, C *x, bool dimensions=0)
-static void multiput(size_t n, int fd, const C *x, bool dimensions=0) {ssize_t r=write(fd,x,n*sizeof(C)); if((ssize_t)(n*sizeof(C))!=r) {std::cout<<"write returned "<<r<<std::endl; laerror("write error");}}
+	{
 	size_t total=0;
 	ssize_t r;
 	do{
 		r=read(fd,x+total,(n-total)*sizeof(C)); 
 		if(r<0 || r==0 && n!=0 ) {std::cout<<"read returned "<<r<<std::endl; laerror("read error");}
 		else total += r/sizeof(C);
 		if(r%sizeof(C)) laerror("read error 2");
 	  }
 	while(total < n);
 	}
 static void multiput(size_t n, int fd, const C *x, bool dimensions=0) 
 	{
 	size_t total=0;
 	ssize_t r;
 	do{
 		r=write(fd,x+total,(n-total)*sizeof(C)); 
 		if(r<0 || r==0 && n!=0 ) {std::cout<<"write returned "<<r<<std::endl; laerror("write error");}
 		else total += r/sizeof(C);
 		if(r%sizeof(C)) laerror("write error 2");
 	  }
 	while(total < n);
 	}
 static void copy(C *dest, C *src, unsigned int n) {memcpy(dest,src,n*sizeof(C));}
 static void clear(C *dest, unsigned int n) {memset(dest,0,n*sizeof(C));}
 static void copyonwrite(C &x) {};
--- a/laerror.h
+++ b/laerror.h
@@ -30,9 +30,9 @@ class LAerror
 	};
 #ifdef __GNUG__
-	#define laerror(X) { LA::laerror2(X, __PRETTY_FUNCTION__); }
+	#define laerror(X) LA::laerror2(X, __PRETTY_FUNCTION__)
 #else
-	#define laerror(X) { LA::laerror2(X, __func__); }
+	#define laerror(X) LA::laerror2(X, __func__)
 #endif
 extern void laerror2(const char *, const char *);
--- a/mat.cc
+++ b/mat.cc
@@ -150,7 +150,7 @@ void NRMat<T>::put(int fd, bool dim, bool transp) const {
 			}
 		}
        }else{
-		LA_traits<T>::multiput(nn*mm,fd,
+		LA_traits<T>::multiput((size_t)nn*(size_t)mm,fd,
 		#ifdef MATPTR
 		        v[0]
 		#else
@@ -202,7 +202,7 @@ void NRMat<T>::get(int fd, bool dim, bool transp){
 			}
 		}
 	}else{
-		LA_traits<T>::multiget(nn*mm,fd,
+		LA_traits<T>::multiget((size_t)nn*(size_t)mm,fd,
 		#ifdef MATPTR
 		        v[0]
 		#else
@@ -838,8 +838,9 @@ NRMat<T>& NRMat<T>::transposeme(const int _n) {
 	return *this;
 }
 /***************************************************************************//**
- * icreate complex double-precision matrix from real double-precision matrix \f$A\f$
+ * create complex double-precision matrix from real double-precision matrix \f$A\f$
 * @param[in] rhs real double-precision matrix \f$A\f$
 * @param[in] imagpart flag indicating whether the matrix \f$A\f$ should be considered as a real
 *  or imaginary part of the complex matrix being created
@@ -877,6 +878,43 @@ NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.
 #endif
 }
 /***************************************************************************//**
 * create double-precision matrix from complex double-precision matrix \f$A\f$
 * @param[in] rhs complex double-precision matrix \f$A\f$
 * @param[in] imagpart flag indicating whether the matrix \f$A\f$ should be taken as the real
 *  or imaginary part of the input complex matrix 
 ******************************************************************************/
 template<>
 NRMat<double>::NRMat(const NRMat<complex<double> > &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) {
 	//total number of elements to extract
 	const int nn_mm = nn*mm;
 	//the complex data are read as doubles with stride 2: offset 0 selects the
 	//real parts, offset 1 the imaginary parts
 	const int offset = imagpart?1:0;
 #ifdef CUDALA
 	//NOTE(review): `location` is tested here without being set from rhs in the
 	//initializer list above - confirm that it has a suitable default
 	if(location == cpu){
 #endif
 	#ifdef MATPTR
 	        //row-pointer storage, sized by the dimensions nn and mm initialised above
 	        v = new double*[nn];
 	        v[0] = new double[nn_mm];
 	        for(int i=1; i<nn; i++) v[i] = v[i-1] + mm;
 	        cblas_dcopy(nn_mm, ((double *)&rhs[0][0]) + offset, 2, v[0], 1);
 	#else
 	        v = new double[nn_mm];
 		cblas_dcopy(nn_mm, ((double *) &rhs[0][0]) + offset, 2, v , 1);
 	#endif
 #ifdef CUDALA
 	}else{
 		v = (double *)gpualloc(sizeof(double)*nn_mm);
 		cublasDcopy(nn_mm, ((double*)&rhs[0][0])+ offset, 2, v , 1);
 		TEST_CUBLAS("cublasDcopy");
 	}
 #endif
 }
 /***************************************************************************//**
 * output of a matrix of general type via lawritemat
 ******************************************************************************/
@@ -1156,8 +1194,9 @@ void NRMat<complex<double> >::randomize(const double &x) {
 #endif
 		for(register int i=0; i<nn; ++i){
 			for(register int j=0; j<mm; ++j){
-				(*this)(i,j).real() = x*(2.*random()/(1. + RAND_MAX) - 1.);
+				const double re = x*(2.*random()/(1. + RAND_MAX) - 1.);
-				(*this)(i,j).imag() = x*(2.*random()/(1. + RAND_MAX) - 1.);
+				const double im = x*(2.*random()/(1. + RAND_MAX) - 1.);
 				(*this)(i,j) = complex<double>(re, im);
 			}
 		}
 #ifdef CUDALA
--- a/mat.h
+++ b/mat.h
@@ -80,6 +80,8 @@ public:
 	//! complexifying constructor 
 	NRMat(const typename LA_traits_complex<T>::NRMat_Noncomplex_type &rhs, bool imagpart = false);
 	//! explicit decomplexifying constructor
 	explicit NRMat(const NRMat<complex<T> > &rhs, bool imagpart = false);
 	//! explicit constructor converting symmetric matrix stored in packed form into a <code>NRMat<T></code> object
 	explicit NRMat(const NRSMat<T> &rhs);
@@ -280,6 +282,9 @@ public:
 	//! resize the matrix
 	void resize(int n, int m);
 	//! deallocate the matrix
 	void dealloc(void) {resize(0,0);}
 	//! get the pointer to the data
 	inline operator T*();
 	//! get the const pointer to the data
@@ -332,6 +337,8 @@ public:
 	explicit NRMat(const SparseMat<T> &rhs);                // dense from sparse
 	//! explicit constructor converting sparse symmetric matrix into \c NRMat<T> object
 	explicit NRMat(const SparseSMat<T> &rhs);
 	//! explicit constructor converting sparse CSR matrix into \c NRMat<T> object
        explicit NRMat(const CSRMat<T> &rhs);
 	//! add up given sparse matrix
 	NRMat & operator+=(const SparseMat<T> &rhs);
@@ -618,7 +625,6 @@ inline T* NRMat<T>::operator[](const int i) {
 	if (i < 0 || i >= nn) laerror("Mat [] out of range");
 	if (!v) laerror("unallocated matrix");
 #endif
 	NOT_GPU(*this);
 	#ifdef MATPTR
 		return v[i];
 	#else
--- a/matexp.h
+++ b/matexp.h
@@ -42,8 +42,8 @@ else
 	for(i=order-1; i>=0; i--)
 		{
 		//std::cerr<<"TEST polynom0 "<<i<<'\n';
-		if(i<order-1) z=y*x;
+		if(i<order-1) {LA_traits<T>::deallocate(z); z=y*x;} //for large matrices avoid storing 4 ones simultaneously
-		y=z+c[i];
+		LA_traits<T>::deallocate(y); y=z+c[i];
 		}
 	}
@@ -346,9 +346,11 @@ int power;
 NRVec<typename LA_traits<V>::normtype> taylor2=exp_aux<M,typename LA_traits<V>::normtype>(mat,power,maxpower,maxtaylor,scale);
 V tmp;
 bool washere=0;
 for(int i=1; i<=(1<<power); ++i) //unfortunatelly, here we have to repeat it many times, unlike if the matrix is stored explicitly
 	{
 	washere=1;
 	if(i>1) rhs=result; //apply again to the result of previous application
 	else result=rhs;
 	tmp=rhs; //now rhs can be used as scratch	
@@ -361,6 +363,8 @@ for(int i=1; i<=(1<<power); ++i) //unfortunatelly, here we have to repeat it man
 		}
 	}
 if(!washere) laerror("integer overflow due to unrealistically big power - use maxpower argument in exptimes()");
 return;
 }
--- a/nonclass.cc
+++ b/nonclass.cc
@@ -240,6 +240,41 @@ linear_solve_do(a,&B[0],1,a.nrows(),det,n);
 }
 // Roman, complex version of linear_solve()
 extern "C" void FORNAME(zgesv)(const int *N, const int *NRHS, double *A, const int *LDA,
           int *IPIV, double *B, const int *LDB, int *INFO);
 /*
  * Complex linear solver built on LAPACK zgesv.
  *   A    square coefficient matrix; copyonwrite() is called on it and it is
  *        handed to zgesv, so its contents are overwritten (see the zgesv docs)
  *   B    optional right-hand sides, one per row (B->nrows() systems of length
  *        B->ncols()); may be NULL when only the determinant is wanted
  *   det  if non-NULL, receives the determinant of A
  *   n    ignored on input: it is reset to A.nrows() before the LAPACK call
  * laerror() is raised for a non-square A, for mismatched B, for an illegal
  * argument reported by zgesv, and for a singular matrix when B is given.
  */
 void linear_solve(NRMat< complex<double> > &A, NRMat< complex<double> > *B, complex<double> *det, int n)
 {
        if (A.nrows() != A.ncols()) laerror("linear_solve() call for non-square matrix");
        if (B && A.nrows() != B->ncols()) laerror("incompatible matrices in linear_solve()");
        A.copyonwrite();
        if (B) B->copyonwrite();

        n = A.nrows();
        int nrhs = B ? B->nrows() : 0;
        int lda = A.ncols();
        int ldb = B ? B->ncols() : A.nrows();
        int r;
        int *ipiv = new int[n];
        FORNAME(zgesv)(&n, &nrhs, (double *)A[0], &lda, ipiv,
                 B ? (double *)(*B)[0] : (double *)0, &ldb, &r);
        if (r < 0) {
                delete[] ipiv;
                laerror("illegal argument in lapack_gesv");
        }
        if (det && r>=0) {
                //product of the diagonal of the factorised matrix
                *det = A[0][0];
                for (int i=1; i<n; ++i) *det *= A[i][i];
                //change sign of det by parity of ipiv permutation: only a pivot
                //that really interchanged two rows (ipiv[i] != i+1 in Fortran's
                //1-based numbering) contributes a factor of -1
                for (int i=0; i<n; ++i) if (ipiv[i] != i+1) *det = -(*det);
        }
        delete [] ipiv;
        if (r>0 && B) laerror("singular matrix in zgesv");
 }
 //other version of linear solver based on gesvx
 //------------------------------------------------------------------------------
@@ -793,6 +828,18 @@ extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const FINT
 		 double *VL, const FINT *LDVL,  double *VR, const FINT *LDVR,  
 		double *WORK, const FINT *LWORK, FINT *INFO );
 extern "C" void FORNAME(zgeev)(const char *JOBVL, const char *JOBVR, const FINT *N,
                complex<double> *A, const FINT *LDA, complex<double> *W, complex<double> *VL, const FINT *LDVL,
                complex<double> *VR, const FINT *LDVR, complex<double> *WORK, const FINT *LWORK,
 		 double *RWORK, FINT *INFO );
 extern "C" void FORNAME(zggev)(const char *JOBVL, const char *JOBVR, const FINT *N,
                complex<double> *A, const FINT *LDA, complex<double> *B, const FINT *LDB, complex<double> *W, complex<double> *WBETA,
                complex<double> *VL, const FINT *LDVL,  complex<double> *VR, const FINT *LDVR,
                complex<double> *WORK, const FINT *LWORK, double *RWORK, FINT *INFO );
 //statics for sorting
 static int *gdperm;
@@ -904,11 +951,12 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 #endif
 	delete[] work;
 //std::cout <<"TEST dgeev\n"<<wr<<wi<<*vr<<*vl<<std::endl;
 	if (r < 0) laerror("illegal argument in ggev/geev in gdiagonalize()");
 	if (r > 0) laerror("convergence problem in ggev/geev in gdiagonalize()");
 //std::cout <<"TEST dgeev\n"<<wr<<wi<<*vr<<*vl<<std::endl;
 	if(biorthonormalize && vl && vr)
 		{
 		if(b || beta) laerror("@@@ biorthonormalize not implemented yet for generalized non-symmetric eigenproblem");//metric b would be needed
@@ -968,6 +1016,7 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 			}
 		}
 	if(sorttype>0)
 		{
 		NRVec<int> perm(n);
@@ -997,12 +1046,119 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 }
 //most general complex routine
 //Eigenproblem driver for a general complex matrix, optionally generalized
 //(a together with b), built on LAPACK zgeev (b==NULL) or zggev (b!=NULL).
 //  a      input matrix; copyonwrite() is called on it before it goes to LAPACK
 //  w      passed to LAPACK as the eigenvalue array (for zggev together with *beta)
 //  vl,vr  optional left/right eigenvector matrices; a NULL pointer sets the
 //         corresponding job flag to 'N'
 //  corder if true, vl and vr are transposed with transposeme(n) at the end
 //  n      order of the problem; n<=0 means a.nrows()
 //  sorttype          any value >0 raises an error (sorting is not implemented here)
 //  biorthonormalize  1 rescales the rows of vl, 2 rescales the rows of vr
 //  b,beta must be supplied both or neither
 template<>
 void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
 		NRMat< complex<double> >*vl, NRMat< complex<double> > *vr,
 		const bool corder, int n, const int sorttype, const int biorthonormalize,
 		NRMat<complex<double> > *b, NRVec<complex<double> > *beta)
 {
 	//argument validation: every supplied array must be at least n in each dimension
 	if(n<=0) n = a.nrows();
 	if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");
 	if (n > w.size()) 
 		laerror("inconsistent dimension of eigen vector in gdiagonalize()");
 	if (vl) if (n > vl->nrows() || n > vl->ncols())
 		laerror("inconsistent dimension of vl in gdiagonalize()");
 	if (vr) if (n > vr->nrows() || n > vr->ncols())
 		laerror("inconsistent dimension of vr in gdiagonalize()");
 	if (beta) if(n > beta ->size()) laerror("inconsistent dimension of beta in gdiagonalize()");
 	if(b) if(n > b->nrows() || n > b->ncols())
 		 laerror("inconsistent dimension of b in gdiagonalize()");
 	if(b && !beta || beta && !b) laerror("missing array for generalized diagonalization");
 	//copyonwrite() is called on everything that is handed to LAPACK below
 	a.copyonwrite();
 	w.copyonwrite();
 	if (vl) vl->copyonwrite();
 	if (vr) vr->copyonwrite();
 	if (beta) beta->copyonwrite();
 	if (b) b->copyonwrite();
 	//job flags: 'V' when the corresponding eigenvector matrix was supplied, 'N' otherwise
 	char jobvl = vl ? 'V' : 'N';
 	char jobvr = vr ? 'V' : 'N';
 	complex<double> work0;
 	//lwork=-1 makes the first LAPACK call a workspace query; the size comes back in work0
 	FINT lwork = -1;
 	FINT r;
 	FINT lda=a.ncols();
 	FINT ldb=0;
 	if(b) ldb=b->ncols();
 	FINT ldvl= vl?vl->ncols():lda;
 	FINT ldvr= vr?vr->ncols():lda;
 	//real scratch array: 8*n doubles when b is given (zggev), 2*n otherwise (zgeev)
 	double *rwork = new double[n*(b?8:2)];
 	//NOTE(review): in all calls below jobvr/vr/ldvr occupy the JOBVL/VL/LDVL slots of
 	//the extern declarations and jobvl/vl/ldvl the JOBVR/VR/LDVR slots - presumably to
 	//compensate for row-major storage appearing transposed to Fortran; confirm
 #ifdef FORINT
        FINT ntmp = n; 
        if(b)  FORNAME(zggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, w, *beta, vr?vr[0]:(complex<double> *)0,
                        &ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, &work0, &lwork, rwork, &r);
 	else FORNAME(zgeev)(&jobvr, &jobvl, &ntmp, a, &lda, w, vr?vr[0]:(complex<double> *)0,
 			&ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, &work0, &lwork, rwork, &r);
 #else
        if(b)  FORNAME(zggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, w, *beta, vr?vr[0]:(complex<double> *)0,
                        &ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, &work0, &lwork, rwork, &r);
 	else FORNAME(zgeev)(&jobvr, &jobvl, &n, a, &lda, w, vr?vr[0]:(complex<double> *)0,
 			&ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, &work0, &lwork, rwork, &r);
 #endif
 	//allocate the workspace size reported by the query and repeat the call for real
        lwork = (FINT) work0.real();
 	complex<double> *work = new complex<double>[lwork];
 #ifdef FORINT
        if(b)  FORNAME(zggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, w, *beta, vr?vr[0]:(complex<double> *)0,
                        &ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, work, &lwork, rwork, &r);
 	else FORNAME(zgeev)(&jobvr, &jobvl, &ntmp, a, &lda, w, vr?vr[0]:(complex<double> *)0,
 			&ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, work, &lwork, rwork, &r);
 #else
        if(b)  FORNAME(zggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, w, *beta, vr?vr[0]:(complex<double> *)0,
                        &ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, work, &lwork, rwork, &r);
 	else FORNAME(zgeev)(&jobvr, &jobvl, &n, a, &lda, w, vr?vr[0]:(complex<double> *)0,
 			&ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, work, &lwork, rwork, &r);
 #endif
 	//scratch arrays are released before the status check so that they are not leaked on error
 	delete[] work;
 	delete[] rwork;
 //std::cout <<"TEST zg(g|e)ev\n"<<w<<*vr<<*vl<<std::endl;
 	if (r < 0) laerror("illegal argument in ggev/geev in gdiagonalize()");
 	if (r > 0) laerror("convergence problem in ggev/geev in gdiagonalize()");
 	if(biorthonormalize && vl && vr)
 		{
 		if(b || beta) laerror("@@@ biorthonormalize not implemented yet for generalized non-hermitian eigenproblem");//metric b would be needed
 		for(int i=0; i<n; ++i)
 			{
 			//calculate scaling parameter
 			complex<double> tmp;
 			cblas_zdotc_sub(n,(*vr)[i],1,(*vl)[i], 1, &tmp);
 			tmp = 1./tmp;
 			//NOTE(review): this prints to stdout on every call that biorthonormalizes
 			std::cout <<"scaling by "<<tmp<<"\n";
 			//row i of vl (mode 1) or of vr (mode 2) is multiplied by the reciprocal of the dot product
 			if(biorthonormalize==1) cblas_zscal(n,&tmp,(*vl)[i],1);
 			if(biorthonormalize==2) cblas_zscal(n,&tmp,(*vr)[i],1);
 			}
 		}
 	if(sorttype>0)
 		{
 		laerror("sorting not implemented in complex gdiagonalize");
 		}
 	//corder requests the transposed layout of the eigenvector matrices
 	if (corder) {
 		if (vl) vl->transposeme(n);
 		if (vr) vr->transposeme(n);
 	}
 }
 template<>
 void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
 		NRMat< complex<double> >*vl, NRMat< complex<double> > *vr,
 		const bool corder, int n, const int sorttype, const int biorthonormalize,
 		NRMat<double> *b, NRVec<double> *beta)
 {
 	if(!corder)  laerror("gdiagonalize() corder 0 not implemented");
 	if(n<=0)  n = a.nrows();
 	if(n> a.nrows() || n ==  a.nrows() && n != a.ncols()) laerror("gdiagonalize() call for a non-square matrix");
@@ -1020,20 +1176,44 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
 		i = 0;
 		while (i < n) {
 			if (wi[i] == 0) {
 			    if(corder)
 				{
                                if (vl) for (int j=0; j<n; j++) (*vl)[j][i] = (*rvl)[i][j];
                                if (vr) for (int j=0; j<n; j++) (*vr)[j][i] = (*rvr)[i][j];
 				}
 			    else
 				{
 				if (vl) for (int j=0; j<n; j++) (*vl)[i][j] = (*rvl)[i][j];
 				if (vr) for (int j=0; j<n; j++) (*vr)[i][j] = (*rvr)[i][j];
 				}
 				i++;
 			} else {
 				if (vl)
 					for (int j=0; j<n; j++) {
 					    if(corder)
 						{
                                                (*vl)[j][i] = complex<double>((*rvl)[i][j], (*rvl)[i+1][j]);
                                                (*vl)[j][i+1] = complex<double>((*rvl)[i][j], -(*rvl)[i+1][j]);
 						}
 						else
 						{
 						(*vl)[i][j] = complex<double>((*rvl)[i][j], (*rvl)[i+1][j]);
 						(*vl)[i+1][j] = complex<double>((*rvl)[i][j], -(*rvl)[i+1][j]);
 						}
 					} 
 				if (vr)
 					for (int j=0; j<n; j++) {
 					    if(corder)
                                                {
                                                (*vr)[j][i] = complex<double>((*rvr)[i][j], (*rvr)[i+1][j]);
                                                (*vr)[j][i+1] = complex<double>((*rvr)[i][j], -(*rvr)[i+1][j]);
                                                }
                                                else
                                                {
 						(*vr)[i][j] = complex<double>((*rvr)[i][j], (*rvr)[i+1][j]);
 						(*vr)[i+1][j] = complex<double>((*rvr)[i][j], -(*rvr)[i+1][j]);
 						}
 					}
 				i += 2;
 			}
 		}
@@ -1043,35 +1223,78 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
 }
-const NRMat<double> realpart(const NRMat< complex<double> > &a)
+template<>
 const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
 {
 	//declared ahead of the CUDALA conditional so that it is still in scope at
 	//the return statement when CUDALA is defined
 	NRMat<double> result(a.nrows(), a.ncols());
 #ifdef CUDALA
 	//NOTE(review): `location` is not declared in this free function - presumably
 	//the location of `a` was meant; confirm against the GPU interface
        if(location == cpu){
 #endif
 	//the complex data are read as doubles with stride 2; offset 0 picks the real parts
 	cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0], 2, result, 1);
 #ifdef CUDALA
        }else{
 	laerror("not implemented for cuda yet");
 	}
 #endif
 	return result;
 }
-const NRMat<double> imagpart(const NRMat< complex<double> > &a)
+template<>
 const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
 {
 	//declared ahead of the CUDALA conditional so that it is still in scope at
 	//the return statement when CUDALA is defined
 	NRMat<double> result(a.nrows(), a.ncols());
 #ifdef CUDALA
 	//NOTE(review): `location` is not declared in this free function - presumably
 	//the location of `a` was meant; confirm against the GPU interface
        if(location == cpu){
 #endif
 	//the complex data are read as doubles with stride 2; offset 1 picks the imaginary parts
 	cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0]+1, 2, result, 1);
 #ifdef CUDALA
        }else{
        laerror("not implemented for cuda yet");
        }
 #endif
 	return result;
 }
-const NRMat< complex<double> > realmatrix (const NRMat<double> &a)
+template<>
 const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &a)
 {
 	//declared ahead of the CUDALA conditional so that it is still in scope at
 	//the return statement when CUDALA is defined
 	NRMat <complex<double> > result(a.nrows(), a.ncols());
 #ifdef CUDALA
 	//NOTE(review): `location` is not declared in this free function - presumably
 	//the location of `a` was meant; confirm against the GPU interface
        if(location == cpu){
 #endif
 	//the real data are written into every second double of the result, starting at offset 0
 	//NOTE(review): the imaginary slots are not written here - confirm that the
 	//(rows, cols) constructor zero-initialises the storage
 	cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0], 2);
 #ifdef CUDALA
        }else{
        laerror("not implemented for cuda yet");
        }
 #endif
 	return result;
 }
-const NRMat< complex<double> > imagmatrix (const NRMat<double> &a)
+template<>
 const NRMat< complex<double> > imagmatrix<NRMat<double> > (const NRMat<double> &a)
 {
 	//declared ahead of the CUDALA conditional so that it is still in scope at
 	//the return statement when CUDALA is defined
 	NRMat< complex<double> > result(a.nrows(), a.ncols());
 #ifdef CUDALA
 	//NOTE(review): `location` is not declared in this free function - presumably
 	//the location of `a` was meant; confirm against the GPU interface
        if(location == cpu){
 #endif
 	//the real data are written into every second double of the result, starting at offset 1
 	//NOTE(review): the real slots are not written here - confirm that the
 	//(rows, cols) constructor zero-initialises the storage
 	cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0]+1, 2);
 #ifdef CUDALA
        }else{
        laerror("not implemented for cuda yet");
        }
 #endif
 	return result;
 }
-const NRMat< complex<double> > complexmatrix (const NRMat<double> &re, const NRMat<double> &im)
+template<>
 const NRMat< complex<double> > complexmatrix<NRMat<double> > (const NRMat<double> &re, const NRMat<double> &im)
 {
 	if(re.nrows()!=im.nrows() || re.ncols() != im.ncols()) laerror("incompatible sizes of real and imaginary parts");
        NRMat< complex<double> > result(re.nrows(), re.ncols());
@@ -1080,57 +1303,60 @@ const NRMat< complex<double> > complexmatrix (const NRMat<double> &re, const NRM
        return result;
 }
 template<>
 const SparseSMat< complex<double> > complexmatrix<SparseSMat<double> >(const SparseSMat<double> &re, const SparseSMat<double> &im) {
  if(re.nrows()!=im.nrows() || re.ncols() != im.ncols()) laerror("incompatible sizes of real and imaginary parts");
  SparseSMat< complex<double> > result(re.nrows(),re.ncols());
  complex<double> tmp;
-
+  SparseSMat<double>::iterator pre(re);
-NRMat<double> matrixfunction(NRMat<double> a, complex<double>
+  for(; pre.notend(); ++pre) {
-		(*f)(const complex<double> &), const bool adjust)
+    tmp = pre->elem;
-{
+    result.add(pre->row,pre->col,tmp,false);
 	int n = a.nrows();
 	NRMat< complex<double> > u(n, n), v(n, n);
 	NRVec< complex<double> > w(n);
 /*
 NRMat<complex<double> > a0=complexify(a);
 */
 	gdiagonalize(a, w, &u, &v);//a gets destroyed, eigenvectors are rows
 	NRVec< complex<double> > z = diagofproduct(u, v, 1, 1);
 /*
 std::cout <<"TEST matrixfunction\n"<<w<<u<<v<<z;
 std::cout <<"TEST matrixfunction1 "<< u*a0 - diagonalmatrix(w)*u<<std::endl;
 std::cout <<"TEST matrixfunction2 "<< a0*v.transpose(1) - v.transpose(1)*diagonalmatrix(w)<<std::endl;
 std::cout <<"TEST matrixfunction3 "<< u*v.transpose(1)<<diagonalmatrix(z)<<std::endl;
 NRVec< complex<double> > wz(n);
 for (int i=0; i<a.nrows(); i++) wz[i] = w[i]/z[i];
 std::cout <<"TEST matrixfunction4 "<< a0<< v.transpose(true)*diagonalmatrix(wz)*u<<std::endl;
 */
 	for (int i=0; i<a.nrows(); i++) w[i] = (*f)(w[i])/z[i];
 	u.diagmultl(w);
 	NRMat< complex<double> > r(n, n);
 	r.gemm(0.0, v, 'c', u, 'n', 1.0);
 	double inorm = cblas_dnrm2(n*n, (double *)r[0]+1, 2);
 	if (inorm > 1e-10) {
 		std::cout << "norm = " << inorm << std::endl;
 		laerror("nonzero norm of imaginary part of real matrixfunction");
  }
-	return realpart(r);
+
  SparseSMat<double>::iterator pim(im);
  for(; pim.notend(); ++pim) {
    tmp = complex<double>(0,1)*(pim->elem);
    result.add(pim->row,pim->col,tmp,false);
  }
  return result;
 }
-NRMat<double> matrixfunction(NRSMat<double> a, double (*f) (double))
+template<>
-{
+const SparseSMat< complex<double> > realmatrix<SparseSMat<double> >(const SparseSMat<double> &re) {
-	int n = a.nrows();
+  SparseSMat< complex<double> > result(re.nrows(),re.ncols());
-	NRVec<double> w(n);
+  complex<double> tmp;
 	NRMat<double> v(n, n);
 	diagonalize(a, w, &v, 0);
-	for (int i=0; i<a.nrows(); i++) w[i] = (*f)(w[i]);
+  SparseSMat<double>::iterator pre(re);
-	NRMat<double> u = v;
+  for(; pre.notend(); ++pre) {
-	v.diagmultl(w);
+    tmp = pre->elem;
-	NRMat<double> r(n, n);
+    result.add(pre->row,pre->col,tmp,false);
-	r.gemm(0.0, u, 't', v, 'n', 1.0);
+  }
-	return r;
+
  return result;
 }
 //Build a complex sparse matrix of the same dimensions whose entries are
 //i times the entries delivered by the iterator over im.
 template<>
 const SparseSMat< complex<double> > imagmatrix<SparseSMat<double> >(const SparseSMat<double> &im) {
  SparseSMat< complex<double> > result(im.nrows(),im.ncols());
  SparseSMat<double>::iterator it(im);
  while(it.notend()) {
    //multiply by the imaginary unit and insert at the same (row, col)
    complex<double> scaled = complex<double>(0,1)*(it->elem);
    result.add(it->row,it->col,scaled,false);
    ++it;
  }
  return result;
 }
 NRMat<double> realmatrixfunction(NRMat<double> a, double (*f) (const double))
 {
        int n = a.nrows();
@@ -1145,6 +1371,7 @@ NRMat<double> realmatrixfunction(NRMat<double> a, double (*f) (const double))
        return r;
 }
 NRMat<complex<double> > complexmatrixfunction(NRMat<double> a, double (*fre) (const double), double (*fim) (const double))
 {
        int n = a.nrows();
@@ -1169,6 +1396,16 @@ NRMat<complex<double> > complexmatrixfunction(NRMat<double> a, double (*fre) (co
 // instantiate template to an addressable function
 //identity map on complex numbers, provided as an ordinary function so that
 //its address can be taken
 complex<double> myccopy (const complex<double> &x)
 {
 	const complex<double> same(x);
 	return same;
 }
 //identity map on doubles, provided as an ordinary function so that its
 //address can be taken
 double mycopy (const double x)
 {
 	const double same = x;
 	return same;
 }
 complex<double> myclog (const complex<double> &x) 
 {
 	return log(x);
@@ -1193,14 +1430,37 @@ double sqrtinv (const double x)
 NRMat<double>  log(const NRMat<double> &a)
 {
-	return matrixfunction(a, &myclog, 1);
+	return matrixfunction(a, &myclog);
 }
 //logarithm of a complex matrix: forwards to matrixfunction() with myclog
 //(which returns log(x)) as the scalar function
 NRMat<complex<double> >  log(const NRMat<complex<double> > &a)
 {
        return matrixfunction(a, &myclog);
 }
 NRMat<double>  exp0(const NRMat<double> &a)
 {
-        return matrixfunction(a, &mycexp, 1);
+        return matrixfunction(a, &mycexp);
 }
 //complex-matrix variant of exp0: forwards to matrixfunction() with mycexp as
 //the scalar function
 NRMat<complex<double> >  exp0(const NRMat<complex<double> > &a)
 {
        return matrixfunction(a, &mycexp);
 }
 //runs matrixfunction() with the identity function myccopy on a complex matrix
 //(presumably a self-check that the input is reproduced - confirm intended use)
 NRMat<complex<double> >  copytest(const NRMat<complex<double> > &a)
 {
        return matrixfunction(a, &myccopy);
 }
 //runs matrixfunction() with the identity function myccopy on a real matrix
 //(presumably a self-check that the input is reproduced - confirm intended use)
 NRMat<double>  copytest(const NRMat<double> &a)
 {
        return matrixfunction(a, &myccopy);
 }
 const NRVec<double> diagofproduct(const NRMat<double> &a, const NRMat<double> &b,
--- a/nonclass.h
+++ b/nonclass.h
@@ -88,8 +88,8 @@ extern const  NRVec<T> diagofproduct(const NRMat<T> &a, const NRMat<T> &b,\
 extern T trace2(const NRMat<T> &a, const NRMat<T> &b, bool trb=0); \
 extern T trace2(const NRSMat<T> &a, const NRSMat<T> &b, const bool diagscaled=0);\
 extern T trace2(const NRSMat<T> &a, const NRMat<T> &b, const bool diagscaled=0);\
-extern void linear_solve(NRMat<T> &a, NRMat<T> *b, double *det=0,int n=0); /*solve Ax^T=b^T (b is nrhs x n) */ \
+extern void linear_solve(NRMat<T> &a, NRMat<T> *b, T *det=0,int n=0); /*solve Ax^T=b^T (b is nrhs x n) */ \
-extern void linear_solve(NRSMat<T> &a, NRMat<T> *b, double *det=0, int n=0); /*solve Ax^T=b^T (b is nrhs x n) */\
+extern void linear_solve(NRSMat<T> &a, NRMat<T> *b, T *det=0, int n=0); /*solve Ax^T=b^T (b is nrhs x n) */\
 extern void linear_solve(NRMat<T> &a, NRVec<T> &b, double *det=0, int n=0); \
 extern void linear_solve(NRSMat<T> &a, NRVec<T> &b, double *det=0, int n=0); \
 extern void diagonalize(NRMat<T> &a, NRVec<LA_traits<T>::normtype> &w, const bool eivec=1, const bool corder=1, int n=0, NRMat<T> *b=NULL, const int itype=1); \
@@ -104,36 +104,28 @@ declare_la(complex<double>)
 // Separate declarations
 //general nonsymmetric matrix and generalized diagonalization
 //corder =0 ... C rows are eigenvectors, =1 ... C columns are eigenvectors
 extern void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 		NRMat<double> *vl, NRMat<double> *vr, const bool corder=1, int n=0, const int sorttype=0, const int biorthonormalize=0,
-		NRMat<double> *b=NULL, NRVec<double> *beta=NULL);
+		NRMat<double> *b=NULL, NRVec<double> *beta=NULL); //this uses real storage of eigenvectors like dgeev
-extern void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
+
 template<typename T>
 extern void gdiagonalize(NRMat<T> &a, NRVec< complex<double> > &w,
 		 NRMat< complex<double> >*vl, NRMat< complex<double> > *vr,
 		 const bool corder=1, int n=0, const int sorttype=0, const int biorthonormalize=0,
-		NRMat<double> *b=NULL, NRVec<double> *beta=NULL);
+		NRMat<T> *b=NULL, NRVec<T> *beta=NULL); //eigenvectors are stored in complex matrices for T both double and complex
 extern NRMat<double> matrixfunction(NRSMat<double> a, double (*f) (double));
 extern NRMat<double> realmatrixfunction(NRMat<double> a, double (*f) (double)); //a has to be in fact symmetric
 extern NRMat<complex<double> > complexmatrixfunction(NRMat<double> a, double (*fre) (double), double (*fim) (double)); //a has to be in fact symmetric
 extern NRMat<double> matrixfunction(NRMat<double> a, complex<double> (*f)(const complex<double> &),const bool adjust=0);
-extern complex<double> sqrtinv(const complex<double> &);
+//complex,real,imaginary parts of various entities
-extern double sqrtinv(const double);
+template<typename T>
-
+extern const typename LA_traits<T>::realtype realpart(const T&);
-//functions on matrices
+template<typename T>
-inline NRMat<double>  sqrt(const NRSMat<double> &a) { return matrixfunction(a,&std::sqrt); }
+extern const typename LA_traits<T>::realtype imagpart(const T&);
-inline NRMat<double>  sqrtinv(const NRSMat<double> &a) { return matrixfunction(a,&sqrtinv); }
+template<typename T>
-inline NRMat<double>  realsqrt(const NRMat<double> &a) { return realmatrixfunction(a,&std::sqrt); }
+extern const typename LA_traits<T>::complextype realmatrix (const T&);
-inline NRMat<double>  realsqrtinv(const NRMat<double> &a) { return realmatrixfunction(a,&sqrtinv); }
+template<typename T>
-inline NRMat<double>  log(const NRSMat<double> &a) { return matrixfunction(a,&std::log); }
+extern const typename LA_traits<T>::complextype imagmatrix (const T&);
-extern NRMat<double> log(const NRMat<double> &a);
+template<typename T>
-extern NRMat<double> exp0(const NRMat<double> &a);
+extern const typename LA_traits<T>::complextype complexmatrix (const T&, const T&);
 extern const NRMat<double> realpart(const NRMat< complex<double> >&);
 extern const NRMat<double> imagpart(const NRMat< complex<double> >&);
 extern const NRMat< complex<double> > realmatrix (const NRMat<double>&);
 extern const NRMat< complex<double> > imagmatrix (const NRMat<double>&);
 extern const NRMat< complex<double> > complexmatrix (const NRMat<double>&, const NRMat<double>&);
 //Cholesky decomposition
 extern void cholesky(NRMat<double> &a, bool upper=1);
@@ -315,5 +307,84 @@ return r;
 }
 //matrix functions via diagonalization
 extern NRMat<double> realmatrixfunction(NRMat<double> a, double (*f) (double)); //a has to be in fact symmetric
 extern NRMat<complex<double> > complexmatrixfunction(NRMat<double> a, double (*fre) (double), double (*fim) (double)); //a has to be in fact symmetric
 //Function of a symmetric/hermitian matrix evaluated through its eigendecomposition:
 //a is diagonalized, the scalar function f is applied to every eigenvalue, and the
 //result is assembled by gemm from the eigenvector matrix and its eigenvalue-scaled copy.
 //a is taken by value, so the caller's matrix is left untouched.
 template<typename T> 
 NRMat<T> matrixfunction(NRSMat<T> a, double (*f) (double)) //of symmetric/hermitian matrix
 {
 	const int dim = a.nrows();
 	NRVec<double> eigvals(dim);
 	NRMat<T> scaled(dim, dim);
 	diagonalize(a, eigvals, &scaled, 0);
 	//keep an unscaled copy of the eigenvectors before one factor gets multiplied by f(eigenvalue)
 	NRMat<T> eigvecs = scaled;
 	int k = 0;
 	while (k < a.nrows())
 		{
 		eigvals[k] = f(eigvals[k]);
 		++k;
 		}
 	NRVec<T> typedvals = eigvals; //diagmultl needs same type
 	scaled.diagmultl(typedvals);
 	NRMat<T> result(dim, dim);
 	result.gemm(0.0, eigvecs, 't', scaled, 'n', 1.0); //gemm will use 'c' for complex ones 
 	return result;
 }
 //Function of a general (nonsymmetric) real or complex square matrix.
 //gdiagonalize() delivers the eigenvalues w and the eigenvector matrices u and v (eigenvectors as rows);
 //z = diagofproduct(u, v, 1, 1) supplies the per-eigenvector normalization factors, which are not
 //assumed to be 1. The result is assembled as v^H * diag(f(w_i)/z_i) * u.
 //  a : input matrix, taken by value because gdiagonalize() overwrites it
 //  f : scalar function applied to each (complex) eigenvalue
 //Returns the result converted back to NRMat<T>.
 //No guard is applied against z_i == 0.
 template<typename T>
 extern NRMat<T> matrixfunction(NRMat<T> a, complex<double> (*f)(const complex<double> &)) //of a general real/complex matrix
 {
 	const int n = a.nrows();
 	NRVec<complex<double> > w(n);
 	NRMat<complex<double> > u(n,n),v(n,n);
 #ifdef debugmf
 NRMat<complex<double> > a0=a;
 #endif
        gdiagonalize<T>(a, w, &u, &v, false,n,0,false,NULL,NULL);//a gets destroyed, eigenvectors are rows
        NRVec< complex<double> > z = diagofproduct(u, v, 1, 1);
 #ifdef debugmf
 std::cout <<"TEST matrixfunction\n"<<w<<u<<v<<z;
 std::cout <<"TEST matrixfunction1 "<< u*a0 - diagonalmatrix(w)*u<<std::endl;
 std::cout <<"TEST matrixfunction2 "<< a0*v.transpose(1) - v.transpose(1)*diagonalmatrix(w)<<std::endl;
 std::cout <<"TEST matrixfunction3 "<< u*v.transpose(1)<<diagonalmatrix(z)<<std::endl;
 //wz is read only by the reconstruction check below, so it is built in debug builds only
 NRVec< complex<double> > wz(n);
 for (int i=0; i<n; i++) wz[i] = w[i]/z[i];
 std::cout <<"TEST matrixfunction4 "<< a0<< v.transpose(true)*diagonalmatrix(wz)*u<<std::endl;
 #endif
        //w and z both have length n; a itself has been overwritten by gdiagonalize and is not consulted again
        for (int i=0; i<n; i++) w[i] = (*f)(w[i])/z[i];
        u.diagmultl(w);
        NRMat< complex<double> > r(n, n);
        r.gemm(0.0, v, 'c', u, 'n', 1.0);
 	return (NRMat<T>) r; //convert back to real if applicable by the explicit decomplexifying constructor; it is NOT checked to which accuracy the imaginary part is actually zero
 }
 //scalar sqrtinv overloads, only declared here (presumably x^(-1/2) - confirm at the definition)
 extern complex<double> sqrtinv(const complex<double> &);
 extern double sqrtinv(const double);
 //functions on matrices
 //the inline wrappers below forward a scalar function pointer to matrixfunction() (NRSMat argument)
 //or to realmatrixfunction() (NRMat argument)
 inline NRMat<double>  sqrt(const NRSMat<double> &a) { return matrixfunction(a,&std::sqrt); }
 inline NRMat<double>  sqrtinv(const NRSMat<double> &a) { return matrixfunction(a,&sqrtinv); }
 inline NRMat<double>  realsqrt(const NRMat<double> &a) { return realmatrixfunction(a,&std::sqrt); }
 inline NRMat<double>  realsqrtinv(const NRMat<double> &a) { return realmatrixfunction(a,&sqrtinv); }
 inline NRMat<double>  log(const NRSMat<double> &a) { return matrixfunction(a,&std::log); }
 //functions of general NRMat matrices, only declared here
 extern NRMat<double> log(const NRMat<double> &a);
 extern NRMat<complex<double> > log(const NRMat<complex<double> > &a);
 extern NRMat<complex<double> > exp0(const NRMat<complex<double> > &a);
 extern NRMat<complex<double> > copytest(const NRMat<complex<double> > &a);
 extern NRMat<double> copytest(const NRMat<double> &a);
 extern NRMat<double> exp0(const NRMat<double> &a);
 }//namespace
 #endif
--- a/smat.cc
+++ b/smat.cc
@@ -58,7 +58,7 @@ void NRSMat<T>::put(int fd, bool dim, bool transp) const {
 		if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
 		if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
 	}
-	LA_traits<T>::multiput(NN2,fd,v,dim);
+	LA_traits<T>::multiput((size_t)nn*(nn+1)/2,fd,v,dim);
 }
 /***************************************************************************//**
@@ -89,7 +89,7 @@ void NRSMat<T>::get(int fd, bool dim, bool transp) {
 	}else{
 		copyonwrite();
 	}
-	LA_traits<T>::multiget(NN2,fd,v,dim);
+	LA_traits<T>::multiget((size_t)nn*(nn+1)/2,fd,v,dim);
 }
--- a/smat.h
+++ b/smat.h
@@ -159,6 +159,7 @@ public:
 	void clear() {copyonwrite(); LA_traits<T>::clear(v,NN2);}; //zero out
 	void resize(const int n);
 	//deallocation is expressed as a resize to dimension 0
 	void dealloc(void) {resize(0);}
 	inline operator T*();
 	inline operator const T*() const;
--- a/sparsemat.cc
+++ b/sparsemat.cc
@@ -1245,6 +1245,9 @@ return *this;
 /*
 Commented out by Roman for ICC
 #define INSTANTIZE(T) \
 template SparseMat<T> & SparseMat<T>::oplusequal(const SparseMat<T> &rhs);\
 template SparseMat<T> & SparseMat<T>::oplusequal(const NRMat<T> &rhs);\
@@ -1291,9 +1294,8 @@ template void SparseMat<T>::permuteindices(const NRVec<SPMatindex> &p);\
 INSTANTIZE(double)
 INSTANTIZE(complex<double>) //some functions are not OK for hermitean matrices, needs a revision!!!
-
+*/
 //////////////////////////////////////////////////////////////////////////////
 //// forced instantiation in the corresponding object file
--- a/sparsemat.h
+++ b/sparsemat.h
@@ -120,6 +120,7 @@ public:
 	void get(int fd, bool dimensions=1, bool transposed=false);
 	void put(int fd, bool dimensions=1, bool transposed=false) const;
 	void resize(const SPMatindex n, const SPMatindex m); //destructive
        //deallocation is expressed as a resize to 0 x 0
        void dealloc(void) {resize(0,0);}
 	void incsize(const SPMatindex n, const SPMatindex m); //increase size without destroying the data
 	void transposeme();
 	const SparseMat transpose() const;
--- a/sparsesmat.cc
+++ b/sparsesmat.cc
@@ -257,6 +257,36 @@ if(divide)
 return divide?NULL:&r[0];
 }
 //Build a new matrix from the stored elements lying in rows fromrow..torow and
 //columns fromcol..tocol (all four bounds inclusive), with indices shifted so that
 //(fromrow,fromcol) becomes (0,0). The bounds are validated only when compiled with DEBUG.
 template <class T>
 SparseSMat<T> SparseSMat<T>::submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const
 {
 #ifdef DEBUG
 	const bool badrows = fromrow<0 || fromrow>=nn || torow<0 || torow>=nn || fromrow>torow;
 	const bool badcols = fromcol<0 || fromcol>=mm || tocol<0 || tocol>=mm || fromcol>tocol;
 	if(badrows || badcols) laerror("invalid submatrix specification");
 #endif
 	const int height = torow - fromrow + 1;
 	const int width = tocol - fromcol + 1;
 	SparseSMat<T> block(height, width);
 	for(typename SparseSMat<T>::iterator it(*this); it.notend(); ++it)
 		{
 		//skip stored elements outside the requested window
 		if(it->row<fromrow || it->row>torow || it->col<fromcol || it->col>tocol) continue;
 		block.add(it->row-fromrow, it->col-fromcol, it->elem, false);
 		}
 	return block;
 }
 //Pass every stored element of rhs to add() on this matrix, shifted so that element (0,0)
 //of rhs lands at (fromrow,fromcol). The target window is validated only when compiled with DEBUG.
 template <class T>
 void SparseSMat<T>::storesubmatrix(const int fromrow, const int fromcol, const SparseSMat<T> &rhs)
 {
 #ifdef DEBUG
 	//last row/column of this matrix that rhs would touch
 	const int lastrow = fromrow + rhs.nrows() - 1;
 	const int lastcol = fromcol + rhs.ncols() - 1;
 	if(fromrow<0 || fromrow>=nn || lastrow>=nn || fromcol<0 || fromcol>=mm || lastcol>=mm) laerror("bad indices in storesubmatrix");
 #endif
 	typename SparseSMat<T>::iterator src(rhs);
 	while(src.notend())
 		{
 		add(src->row+fromrow, src->col+fromcol, src->elem, false);
 		++src;
 		}
 }
 template <class T>
@@ -305,6 +335,7 @@ void SparseSMat<T>::put(int fd, bool dimen, bool transp) const {
 /* Commented out by Roman for ICC
 #define INSTANTIZE(T) \
 template void SparseSMat<T>::gemm(const T beta, const SparseSMat &a, const char transa, const SparseSMat &b, const char transb, const T alpha); \
@@ -321,8 +352,8 @@ template void SparseSMat<T>::put(int fd, bool dimen, bool transp) const; \
 INSTANTIZE(double)
 INSTANTIZE(complex<double>) 
 */
 //// forced instantiation of functions in the header in the corresponding object file
 template class SparseSMat<double>;
--- a/sparsesmat.h
+++ b/sparsesmat.h
@@ -61,9 +61,11 @@ public:
 	explicit SparseSMat(const SparseMat<T> &rhs);
 	explicit SparseSMat(const NRSMat<T> &rhs);
 	explicit SparseSMat(const NRMat<T> &rhs);
 	explicit SparseSMat(const CSRMat<T> &rhs);
 	SparseSMat & operator=(const SparseSMat &rhs);
 	void copyonwrite();
        void resize(const SPMatindex nn, const SPMatindex mm);
   	//deallocation is expressed as a resize to 0 x 0
   	void dealloc(void) {resize(0,0);}
 	inline void setcoldim(int i) {mm=(SPMatindex)i;};
 	//std::map<SPMatindex,T> *line(SPMatindex n) const {return v[n];};
 	typedef std::map<SPMatindex,T> *ROWTYPE;
@@ -100,6 +102,8 @@ public:
 	int nrows() const {return nn;}
 	int ncols() const {return mm;}
 	SparseSMat<T>  cholesky(void) const;
 	//extract rows fromrow..torow and columns fromcol..tocol (bounds inclusive) into a new matrix
 	SparseSMat submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const;
 	//pass the elements of rhs to add() with its (0,0) element placed at (fromrow,fromcol)
 	void storesubmatrix(const int fromrow, const int fromcol, const SparseSMat &rhs);
 	class iterator {//not efficient, just for output to ostreams
        private:
--- a/t.cc
+++ b/t.cc
@@ -62,9 +62,10 @@ NRVec<double> x(1.,10);
 NRVec<double> y(2.,10);
 NRVec<double> z(-2.,10);
-cout.setf(ios::scientific);
+//cout.setf(ios::scientific);
-//cc:cout.setf(ios::fixed);
+cc:cout.setf(ios::fixed);
-cout.precision(12);
+cout.precision(10);
 cin.exceptions ( ifstream::eofbit | ifstream::failbit | ifstream::badbit );
 if(0) test(x);
@@ -415,6 +416,50 @@ cout <<v.transpose(1)*u;
 }
 //test (disabled): gdiagonalize of a real matrix read from cin, with eigenvalues and
 //eigenvectors stored as complex; prints u, w, v, the diagonal z of their product,
 //and the matrix rebuilt as v.transpose(1)*diag(w/z)*u
 if(0)
 {
 NRMat<double> a;
 cin >>a;
 int n=a.nrows();
 NRMat<complex<double> > u(n,n),v(n,n);
 NRVec<complex<double> >w(n);
 gdiagonalize(a,w,&u,&v,0,n,0,1);
 cout <<u;
 cout <<w;
 cout <<v;
 NRVec<complex<double> >z=diagofproduct(u,v,1,1);
 cout <<z;
 for(int i=0;i<a.nrows();++i) w[i]/=z[i];//account for normalization of eigenvectors
 cout <<u*v.transpose(1); //check biorthonormality
 u.diagmultl(w);
 cout <<v.transpose(1)*u;
 }
 //test (enabled): same sequence as the preceding block but for a complex input matrix
 if(1)
 {
 NRMat<complex<double> > a;
 cin >>a;
 int n=a.nrows();
 NRMat<complex<double> > u(n,n),v(n,n);
 NRVec<complex<double> >w(n);
 gdiagonalize(a,w,&u,&v,0,n,0,1);
 cout <<u;
 cout <<w;
 cout <<v;
 NRVec<complex<double> >z=diagofproduct(u,v,1,1);
 cout <<z;
 for(int i=0;i<a.nrows();++i) w[i]/=z[i];//account for normalization of eigenvectors
 cout <<u*v.transpose(1); //check biorthonormality
 u.diagmultl(w);
 cout <<v.transpose(1)*u;
 }
 if(0)
 {
@@ -773,8 +818,42 @@ for(int i=1; i<4;i++) b=b*b;
 //test (disabled): copytest on a real matrix read from cin, then copytest, log and exp(log)
 //on its complexified copy
 if(0)
 {
-NRMat<double> a;
+NRMat<double> aa,bb,cc;
 cin >>aa;
 cc=copytest(aa);
 cout <<cc;
 NRMat<complex<double> > a,b,c;
 a=complexify(aa);
 c=copytest(a);
 cout <<c;
 b=log(a);
 cout <<b;
 cout <<exp(b);
 }
 //test (disabled): copytest, log and exp(log) on a complex matrix read from cin
 if(0)
 {
 NRMat<complex<double> > a,b,c;
 cin>>a;
 c=copytest(a);
 cout <<c;
 b=log(a);
 cout <<b;
 cout <<exp(b);
 }
 //test (disabled): copytest on a real matrix read from cin
 if(0)
 {
 NRMat<double> a,b,c;
 cin >>a;
 c=copytest(a);
 cout <<c;
 }
 if(0)
 {
 NRMat<double> a;
 NRMat<double> b=exp(a);
 NRMat<double> c=log(b);
 cout <<a;
@@ -1163,6 +1242,7 @@ cout <<a.oplus(b);
 cout <<a.otimes(b);
 }
 //test of general square matrix eigenvector derivatives
 if(0)
 {
@@ -1236,21 +1316,28 @@ tmp.diagonalof(wrg);
 for(int i=0; i<n; ++i)
        for(int j=0; j<n; ++j)
 		if(i!=j) tmp(i,j) /= (wr[j] - wr[i]); else  tmp(i,j) = 0.;
-cout <<" old X matrix \n"<<tmp<<endl;
+cout <<" old X matrix (tmp) \n"<<tmp<<endl;
 NRMat<double> Y = tmp;
 NRMat<double> S = vr.transpose() * vr;
 cout <<"test S\n"<<S;
 NRMat<double> tmp2 = S * tmp;
 cout <<"test tmp2\n"<<tmp2;
 Y.copyonwrite();
 for(int i=0; i<n; ++i) Y(i,i) -= tmp2(i,i);
 cout <<"Y matrix \n"<< Y;
-NRMat<double> numX = inverse(vr) * vrd;
+NRMat<double> vri = inverse(vr);
 NRMat<double> numX = vri * vrd;
 cout <<" numerical X matrix \n"<< numX;
 cout <<" numerical X matrix test = "<< (vr * numX - vrd).norm()<<endl;
 vrg = vr * Y;
 vlg = - (Y*vri).transpose();
 //and compare
 cout <<"eigenvalue numerical derivative\n"<<wrd<<endl;
 cout <<"eigenvalue analytic derivative\n"<<wrg<<endl;
@@ -1261,9 +1348,17 @@ cout <<"right eigenvector  numerical derivative\n"<<vrd<<endl;
 cout <<"right eigenvector  analytic derivative\n"<<vrg<<endl;
 cout <<"right eigenvector derivative error = "<<(vrd-vrg).norm()<<endl;
 //and for left eigenvectors
 cout <<"left eigenvector  numerical derivative\n"<<vld<<endl;
 cout <<"left eigenvector  analytic derivative\n"<<vlg<<endl;
 cout <<"left eigenvector derivative error = "<<(vld-vlg).norm()<<endl;
 }
 //@@@@@@@make this derivative check in complex version
 if(0)
 {
 try { laerror("test catch exception"); }
@@ -1415,7 +1510,9 @@ cout <<"symmetry error "<<(het-he).norm()<<endl;
 if(0)
 {
-NRSMat<double> hd(100);
+int n;
 cin >>n;
 NRSMat<double> hd(n);
 hd.randomize(1);
 SparseSMat<double> h(hd);
 NRMat<double> rd = hd*hd;
@@ -1425,14 +1522,22 @@ NRMat<double> r2(rx);
 cout <<"Error = "<<(r2-rd).norm()<<endl;
 }
 if(0)
 {
-SparseSMat<complex<double> > h;
+SparseSMat<double> h0;
-cin>>h;
+cin>>h0;
-h *= complex<double>(0,1);
+cout <<"matrix read\n"; cout.flush();
 SparseSMat<double> h1 = h0; //.submatrix(0,2047,0,2047);
 SparseSMat<complex<double> > h = imagmatrix(h1);
 double t=clock()/((double) (CLOCKS_PER_SEC));
-SparseSMat<complex<double> > r = exp(h);
+SparseSMat<complex<double> > r = h*h;
-cout <<"SparseSMat time "<<clock()/((double) (CLOCKS_PER_SEC))-t <<"\n";
+cout <<"SparseSMat mult time "<<clock()/((double) (CLOCKS_PER_SEC))-t <<"\n";
 cout.flush();
 t=clock()/((double) (CLOCKS_PER_SEC));
 r = exp(h);
 cout <<"SparseSMat exp time "<<clock()/((double) (CLOCKS_PER_SEC))-t <<"\n";
 cout.flush();
 if(h.nrows()<=1024)
 {
 NRSMat<complex<double> > h3(h);
@@ -1443,6 +1548,7 @@ cout <<"errorx = "<<(r2-NRSMat<complex<double> >(r)).norm()<<endl;
 }
 }
 if(0)
 {
 int n;
@@ -1598,7 +1704,7 @@ cgpu.moveto(cpu);
 cout << "Error = "<<(c-cgpu).norm()<<endl;
 }
-if(1)
+if(0)
 {
 int n;
 cin >>n;
@@ -1627,6 +1733,31 @@ c.moveto(gpu1);
 cout << "Error = "<<(c-cgpu).norm()<<endl;
 }
 /*
 if(0)
 {
 CSRMat<double> h0;
 cin>>h0;
 cout <<"matrix read\n"; cout.flush();
 CSRMat<double> h1 = h0; 
 CSRMat<complex<double> > h = imagmatrix(h1);
 double t=clock()/((double) (CLOCKS_PER_SEC));
 CSRMat<complex<double> > r = h*h;
 cout <<"CSRMat mult time "<<clock()/((double) (CLOCKS_PER_SEC))-t <<"\n";
 cout.flush();
 t=clock()/((double) (CLOCKS_PER_SEC));
 r = exp(h);
 cout <<"CSRMat exp time "<<clock()/((double) (CLOCKS_PER_SEC))-t <<"\n";
 cout.flush();
 if(h.nrows()<=1024)
 {
 NRMat<complex<double> > h2(h);
 NRMat<complex<double> >r2 = exp(h2);
 cout <<"error = "<<(r2-NRMat<complex<double> >(r)).norm()<<endl;
 }
 }
 */
--- a/vec.cc
+++ b/vec.cc
@@ -793,14 +793,14 @@ NRVec<complex<double> >::otimes(const NRVec<complex<double> > &b, const bool con
 		if(conj){
 			const cuDoubleComplex alpha = make_cuDoubleComplex(scale.real(), -scale.imag());
-			cublasZgerc(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result[0]), 1);
+			cublasZgerc(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result[0]), b.nn);
 			TEST_CUBLAS("cublasZgerc");
 			result.conjugateme();
 		}else{
 			const cuDoubleComplex alpha = make_cuDoubleComplex(scale.real(), +scale.imag());
-			cublasZgeru(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result[0]), 1);
+			cublasZgeru(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result[0]), b.nn);
 			TEST_CUBLAS("cublasZgeru");
 		}
 	}
@@ -839,6 +839,9 @@ NRVec<complex<double> > complexify(const NRVec<double> &rhs) {
 /***************************************************************************//**
 * forced instantiation in the corresponding object file
 ******************************************************************************/
 /*
 Commented out by Roman for ICC
 #define INSTANTIZE(T) \
 template void NRVec<T>::put(int fd, bool dim, bool transp) const; \
 template void NRVec<T>::get(int fd, bool dim, bool transp); \
@@ -855,19 +858,7 @@ INSTANTIZE(unsigned short)
 INSTANTIZE(unsigned int)
 INSTANTIZE(unsigned long)
 INSTANTIZE(unsigned long long)
-
+*/
 template class NRVec<double>;
 template class NRVec<complex<double> >;
 template class NRVec<char>;
 template class NRVec<short>;
 template class NRVec<int>;
 template class NRVec<long>;
 template class NRVec<long long>;
 template class NRVec<unsigned char>;
 template class NRVec<unsigned short>;
 template class NRVec<unsigned int>;
 template class NRVec<unsigned long>;
 template class NRVec<unsigned long long>;
 #define INSTANTIZE_DUMMY(T) \
 template<> void NRVec<T>::gemv(const T beta, const NRMat<T> &a, const char trans,  const T alpha, const NRVec<T> &x) { laerror("gemv on unsupported types"); } \
@@ -878,6 +869,11 @@ template<> void NRVec<T>::gemv(const  LA_traits_complex<T>::Component_type beta,
 template<> NRVec<T> & NRVec<T>::normalize(LA_traits<T>::normtype *) {laerror("normalize() impossible for integer types"); return *this;} \
 template<> const NRMat<T> NRVec<T>::otimes(const NRVec<T> &b,const bool conj, const T &scale) const {laerror("otimes presently implemented only for double and complex double"); return NRMat<T> ();}
 // Roman
 // The following SparseMat gemv specializations are not implemented: their bodies only call laerror().
 template<> void NRVec<double>::gemv(const double beta, const SparseMat<double> &a, const char trans,  const double alpha, const NRVec<double> &x, bool s) { laerror("gemv on unsupported types"); } 
 template<> void NRVec< complex<double> >::gemv(const complex<double> beta, const SparseMat< complex<double> > &a, const char trans,  const complex<double> alpha, const NRVec< complex<double> > &x, bool s) { laerror("gemv on unsupported types"); } 
 INSTANTIZE_DUMMY(char)
 INSTANTIZE_DUMMY(short)
@@ -902,4 +898,17 @@ INSTANTIZE_DUMMY(complex<unsigned long long>)
 INSTANTIZE_DUMMY(complex<complex<double> >)
 INSTANTIZE_DUMMY(complex<complex<float> >)
 template class NRVec<double>;
 template class NRVec<complex<double> >;
 template class NRVec<char>;
 template class NRVec<short>;
 template class NRVec<int>;
 template class NRVec<long>;
 template class NRVec<long long>;
 template class NRVec<unsigned char>;
 template class NRVec<unsigned short>;
 template class NRVec<unsigned int>;
 template class NRVec<unsigned long>;
 template class NRVec<unsigned long long>;
 }//namespace
--- a/vec.h
+++ b/vec.h
@@ -287,6 +287,9 @@ public:
 	//! resize the current vector
 	void resize(const int n);
 	//!deallocate the current vector
 	void dealloc(void) {resize(0);}
 	//! determine the norm of this vector 
 	inline const typename LA_traits<T>::normtype norm() const;