*** empty log message ***

2011-01-18 14:37:05 +00:00 · 2011-01-18 14:37:05 +00:00 · 4534c2e56a
commit 4534c2e56a
parent 600b5b3abd
21 changed files with 753 additions and 138 deletions
--- a/csrmat.cc
+++ b/csrmat.cc
@ -30,6 +30,9 @@ namespace LA {



+/*
+ Commented out by Roman for ICC
+
 #define INSTANTIZE(T) \
 template void CSRMat<T>::gemm(const T beta, const CSRMat &a, const char transa, const CSRMat &b, const char transb, const T alpha); \
 template CSRMat<T> & CSRMat<T>::operator*=(const T &a); \
@ -45,8 +48,8 @@ template void CSRMat<T>::put(int fd, bool dimen, bool transp) const; \


 INSTANTIZE(double)
-
 INSTANTIZE(complex<double>) 
+*/

 //// forced instantization of functions in the header in the corresponding object file
 template class CSRMat<double>;
--- a/csrmat.h
+++ b/csrmat.h
@ -71,6 +71,7 @@ public:
 	CSRMat & operator=(const CSRMat &rhs);
 	void copyonwrite();
        void resize(const SPMatindex nn, const SPMatindex mm);
+	void dealloc(void) {resize(0,0);} //shorthand for resize(0,0); presumably this releases the storage - confirm in resize()
 	void moveto(GPUID destination);
        void clear();
 	~CSRMat();
@ -128,5 +129,12 @@ public:
 	*/
 };

+template <typename T>
+std::ostream & operator<<(std::ostream &s, const CSRMat<T> &x); //stream output of a CSR matrix; only declared here, the definition is not visible in this excerpt
+
+template <class T>
+std::istream& operator>>(std::istream  &s, CSRMat<T> &x); //stream input counterpart; likewise only declared here
+
+
 }//namespace
 #endif //_CSRMAT_H_
--- a/davidson.h
+++ b/davidson.h
@ -33,26 +33,21 @@ namespace LA {
 //therefore the whole implementation must be a template in a header
 //Note that for efficiency in a direct CI case the diagonalof() should cache its result

-template <typename T, typename Matrix>
-extern void davidson(const Matrix &bigmat, NRVec<T> &eivals, NRVec<T> *eivecs, const char *eivecsfile, 
-		int nroots=1,  const bool verbose=0, const double eps=1e-6,
-	 	const bool incore=1, int maxit=100, const int maxkrylov = 500,
-		void (*initguess)(NRVec<T> &)=NULL);

 //@@@options: left eigenvectors by matrix transpose, overridesymmetric (for nrmat)
 //@@@small matrix gdiagonalize - shift complex roots up (option to gdiagonalize?)
 //@@@test gdiagonalize whether it sorts the roots and what for complex ones
+//@@@implement left eigenvectors for nonsymmetric case


 //Davidson algorithm: J. Comp. Phys. 17:817 (1975) 

-//@@@implement left eigenvectors for nonsymmetric case

 template <typename T, typename Matrix>
-void davidson(const Matrix &bigmat, NRVec<T> &eivals, NRVec<T> *eivecs, const char *eivecsfile,
-                int nroots,  const bool verbose, const double eps, 
-                const bool incore, int maxit, const int maxkrylov,
-		void (*initguess)(NRVec<T> &))
+extern void davidson(const Matrix &bigmat, NRVec<T> &eivals, NRVec<T> *eivecs, const char *eivecsfile, 
+		int nroots=1,  const bool verbose=0, const double eps=1e-6,
+	 	const bool incore=1, int maxit=100, const int maxkrylov = 500,
+		void (*initguess)(NRVec<T> &)=NULL)
 {
 bool flag=0;
 int n=bigmat.nrows();
@ -87,7 +82,7 @@ smallH=0;


 //default guess based on lowest diagonal element of the matrix
-if(initguess) (*initguess)(vec1);
+if(initguess) initguess(vec1);
 else
 	{
 	const T *diagonal = bigmat.diagonalof(vec2,false,true);
--- a/fourindex.h
+++ b/fourindex.h
@ -226,6 +226,7 @@ public:
 	inline matel4<I,T> *getlist() const {return list;}
 	inline I size() const {return nn;}
 	void resize(const I n);
+	void dealloc(void) {resize(0);} //shorthand for resize(0); presumably this releases the element list - confirm in resize()
 	void copyonwrite();
 	unsigned long length() const;
 	inline void add(const I i, const I j, const I k, const I l, const T elem) 
--- a/la.h
+++ b/la.h
@ -42,6 +42,7 @@
 #include "smat.h"
 #include "sparsemat.h"
 #include "sparsesmat.h"
+#include "csrmat.h"
 #include "vec.h"

 using namespace LA;
--- a/la_traits.h
+++ b/la_traits.h
@ -220,8 +220,30 @@ static inline normtype norm (const  complex<C> &x) {return std::abs(x);}
 static inline void axpy (complex<C> &s, const complex<C> &x, const complex<C> &c) {s+=x*c;}
 static inline void get(int fd, complex<C> &x, bool dimensions=0, bool transp=0) {if(sizeof(complex<C>)!=read(fd,&x,sizeof(complex<C>))) laerror("read error");}
 static inline void put(int fd, const complex<C> &x, bool dimensions=0, bool transp=0) {if(sizeof(complex<C>)!=write(fd,&x,sizeof(complex<C>))) laerror("write error");}
-static void multiget(size_t n,int fd, complex<C> *x, bool dimensions=0){ssize_t r=read(fd,x,n*sizeof(complex<C>)); if((ssize_t)(n*sizeof(complex<C>))!=r) {std::cout<<"read returned "<<r<<std::endl; laerror("read error");}}
-static void multiput(size_t n, int fd, const complex<C> *x, bool dimensions=0) {ssize_t r=write(fd,x,n*sizeof(complex<C>)); if((ssize_t)(n*sizeof(complex<C>))!=r) {std::cout<<"write returned "<<r<<std::endl; laerror("write error");}}
+static void multiget(size_t n,int fd, complex<C> *x, bool dimensions=0)
+	{
+	//Read n elements from file descriptor fd into x (the dimensions argument is not used here).
+	//read() may deliver fewer bytes than requested, so keep reading until everything has arrived.
+	//Progress is counted in bytes, hence a short read that stops inside an element is simply
+	//continued instead of being reported as an error.
+	char *dest=reinterpret_cast<char *>(x);
+	size_t remaining=n*sizeof(complex<C>);
+	while(remaining>0)
+		{
+		const ssize_t r=read(fd,dest,remaining);
+		if(r<=0) //r<0 is an I/O error, r==0 is end of file before all data arrived
+			{
+			std::cout<<"read returned "<<r<<std::endl;
+			laerror("read error");
+			return; //only reached if laerror() returns; never loop on a failing descriptor
+			}
+		dest+=r;
+		remaining-=(size_t)r;
+		}
+	}
+static void multiput(size_t n, int fd, const complex<C> *x, bool dimensions=0) 
+	{
+	//Write n elements from x to file descriptor fd (the dimensions argument is not used here).
+	//write() may accept fewer bytes than requested, so keep writing until everything is out.
+	//Progress is counted in bytes, hence a short write that stops inside an element is simply
+	//continued instead of being reported as an error.
+	const char *src=reinterpret_cast<const char *>(x);
+	size_t remaining=n*sizeof(complex<C>);
+	while(remaining>0)
+		{
+		const ssize_t r=write(fd,src,remaining);
+		if(r<=0) //r<0 is an I/O error, r==0 means no progress was made
+			{
+			std::cout<<"write returned "<<r<<std::endl;
+			laerror("write error");
+			return; //only reached if laerror() returns; never loop on a failing descriptor
+			}
+		src+=r;
+		remaining-=(size_t)r;
+		}
+	}
 static void copy(complex<C> *dest, complex<C> *src, unsigned int n) {memcpy(dest,src,n*sizeof(complex<C>));}
 static void clear(complex<C> *dest, unsigned int n) {memset(dest,0,n*sizeof(complex<C>));}
 static void copyonwrite(complex<C> &x) {};
@ -232,6 +254,7 @@ static inline C realpart(const complex<C> &x) {return x.real();}
 static inline C imagpart(const complex<C> &x) {return x.imag();}
 };

+
 //non-complex scalars
 template<typename C>
 struct LA_traits_aux<C, scalar_true> {
@ -248,8 +271,30 @@ static inline normtype norm (const  C &x) {return std::abs(x);}
 static inline void axpy (C &s, const C &x, const C &c) {s+=x*c;}
 static inline void put(int fd, const C &x, bool dimensions=0, bool transp=0) {if(sizeof(C)!=write(fd,&x,sizeof(C))) laerror("write error");}
 static inline void get(int fd, C &x, bool dimensions=0, bool transp=0) {if(sizeof(C)!=read(fd,&x,sizeof(C))) laerror("read error");}
-static void multiget(size_t n,int fd, C *x, bool dimensions=0){ssize_t r=read(fd,x,n*sizeof(C)); if((ssize_t)(n*sizeof(C))!=r) {std::cout<<"read returned "<<r<<std::endl; laerror("read error");}}
-static void multiput(size_t n, int fd, const C *x, bool dimensions=0) {ssize_t r=write(fd,x,n*sizeof(C)); if((ssize_t)(n*sizeof(C))!=r) {std::cout<<"write returned "<<r<<std::endl; laerror("write error");}}
+static void multiget(size_t n,int fd, C *x, bool dimensions=0)
+	{
+	//Read n elements from file descriptor fd into x (the dimensions argument is not used here).
+	//read() may deliver fewer bytes than requested, so keep reading until everything has arrived.
+	//Progress is counted in bytes, hence a short read that stops inside an element is simply
+	//continued instead of being reported as an error.
+	char *dest=reinterpret_cast<char *>(x);
+	size_t remaining=n*sizeof(C);
+	while(remaining>0)
+		{
+		const ssize_t r=read(fd,dest,remaining);
+		if(r<=0) //r<0 is an I/O error, r==0 is end of file before all data arrived
+			{
+			std::cout<<"read returned "<<r<<std::endl;
+			laerror("read error");
+			return; //only reached if laerror() returns; never loop on a failing descriptor
+			}
+		dest+=r;
+		remaining-=(size_t)r;
+		}
+	}
+static void multiput(size_t n, int fd, const C *x, bool dimensions=0) 
+	{
+	//Write n elements from x to file descriptor fd (the dimensions argument is not used here).
+	//write() may accept fewer bytes than requested, so keep writing until everything is out.
+	//Progress is counted in bytes, hence a short write that stops inside an element is simply
+	//continued instead of being reported as an error.
+	const char *src=reinterpret_cast<const char *>(x);
+	size_t remaining=n*sizeof(C);
+	while(remaining>0)
+		{
+		const ssize_t r=write(fd,src,remaining);
+		if(r<=0) //r<0 is an I/O error, r==0 means no progress was made
+			{
+			std::cout<<"write returned "<<r<<std::endl;
+			laerror("write error");
+			return; //only reached if laerror() returns; never loop on a failing descriptor
+			}
+		src+=r;
+		remaining-=(size_t)r;
+		}
+	}
 static void copy(C *dest, C *src, unsigned int n) {memcpy(dest,src,n*sizeof(C));}
 static void clear(C *dest, unsigned int n) {memset(dest,0,n*sizeof(C));}
 static void copyonwrite(C &x) {};
--- a/laerror.h
+++ b/laerror.h
@ -30,9 +30,9 @@ class LAerror
 	};

 #ifdef __GNUG__
-	#define laerror(X) { LA::laerror2(X, __PRETTY_FUNCTION__); }
+	#define laerror(X) LA::laerror2(X, __PRETTY_FUNCTION__)
 #else
-	#define laerror(X) { LA::laerror2(X, __func__); }
+	#define laerror(X) LA::laerror2(X, __func__)
 #endif

 extern void laerror2(const char *, const char *);
--- a/mat.cc
+++ b/mat.cc
@ -150,7 +150,7 @@ void NRMat<T>::put(int fd, bool dim, bool transp) const {
 			}
 		}
        }else{
-		LA_traits<T>::multiput(nn*mm,fd,
+		LA_traits<T>::multiput((size_t)nn*(size_t)mm,fd,
 		#ifdef MATPTR
 		        v[0]
 		#else
@ -202,7 +202,7 @@ void NRMat<T>::get(int fd, bool dim, bool transp){
 			}
 		}
 	}else{
-		LA_traits<T>::multiget(nn*mm,fd,
+		LA_traits<T>::multiget((size_t)nn*(size_t)mm,fd,
 		#ifdef MATPTR
 		        v[0]
 		#else
@ -838,8 +838,9 @@ NRMat<T>& NRMat<T>::transposeme(const int _n) {
 	return *this;
 }

+
 /***************************************************************************//**
- * icreate complex double-precision matrix from real double-precision matrix \f$A\f$
+ * create complex double-precision matrix from real double-precision matrix \f$A\f$
 * @param[in] rhs real double-precision matrix \f$A\f$
 * @param[in] imagpart flag indicating whether the matrix \f$A\f$ should be considered as a real
 *  or imaginary part of the complex matrix being created
@ -877,6 +878,43 @@ NRMat<complex<double> >::NRMat(const NRMat<double> &rhs, bool imagpart): nn(rhs.
 #endif
 }

+
+
+/***************************************************************************//**
+ * create double-precision matrix from complex double-precision matrix \f$A\f$
+ * @param[in] rhs complex double-precision matrix \f$A\f$
+ * @param[in] imagpart flag indicating whether the matrix \f$A\f$ should be taken as the real
+ *  or imaginary part of the input complex matrix 
+ ******************************************************************************/
+template<>
+NRMat<double>::NRMat(const NRMat<complex<double> > &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) {
+	//A complex<double> is laid out as two consecutive doubles (re, im), so the requested part
+	//is extracted by a stride-2 copy starting at offset 0 (real part) or 1 (imaginary part).
+	const int nn_mm = nn*mm;
+	const int offset = imagpart?1:0;
+#ifdef CUDALA
+	//NOTE(review): location is tested here but this constructor does not set it from rhs - confirm it is initialized
+	if(location == cpu){
+#endif
+	#ifdef MATPTR
+	        //row-pointer storage: nn pointers into one contiguous buffer of nn*mm doubles
+	        //(the dimensions are the members nn and mm; no n or m exists in this scope)
+	        v = new double*[nn];
+	        v[0] = new double[nn_mm];
+	        for(int i=1; i<nn; i++) v[i] = v[i-1] + mm;
+	
+	        cblas_dcopy(nn_mm, ((double *)&rhs[0][0]) + offset, 2, v[0], 1);
+	#else
+	        v = new double[nn_mm];
+		cblas_dcopy(nn_mm, ((double *) &rhs[0][0]) + offset, 2, v , 1);
+	#endif
+#ifdef CUDALA
+	}else{
+		v = (double *)gpualloc(sizeof(double)*nn_mm);
+		cublasDcopy(nn_mm, ((double*)&rhs[0][0])+ offset, 2, v , 1);
+		TEST_CUBLAS("cublasDcopy");
+	}
+#endif
+}
+
+
+
+
+
 /***************************************************************************//**
 * output of a matrix of general type via lawritemat
 ******************************************************************************/
@ -1156,8 +1194,9 @@ void NRMat<complex<double> >::randomize(const double &x) {
 #endif
 		for(register int i=0; i<nn; ++i){
 			for(register int j=0; j<mm; ++j){
-				(*this)(i,j).real() = x*(2.*random()/(1. + RAND_MAX) - 1.);
-				(*this)(i,j).imag() = x*(2.*random()/(1. + RAND_MAX) - 1.);
+				const double re = x*(2.*random()/(1. + RAND_MAX) - 1.);
+				const double im = x*(2.*random()/(1. + RAND_MAX) - 1.);
+				(*this)(i,j) = complex<double>(re, im);
 			}
 		}
 #ifdef CUDALA
--- a/mat.h
+++ b/mat.h
@ -80,6 +80,8 @@ public:

 	//! complexifying constructor 
 	NRMat(const typename LA_traits_complex<T>::NRMat_Noncomplex_type &rhs, bool imagpart = false);
+	//! explicit decomplexifying constructor
+	explicit NRMat(const NRMat<complex<T> > &rhs, bool imagpart = false);

 	//! explicit constructor converting symmetric matrix stored in packed form into a <code>NRMat<T></code> object
 	explicit NRMat(const NRSMat<T> &rhs);
@ -280,6 +282,9 @@ public:
 	//! resize the matrix
 	void resize(int n, int m);

+	//! deallocate the matrix
+	void dealloc(void) {resize(0,0);}
+
 	//! get the pointer to the data
 	inline operator T*();
 	//! get the const pointer to the data
@ -332,6 +337,8 @@ public:
 	explicit NRMat(const SparseMat<T> &rhs);                // dense from sparse
 	//! explicit constructor converting sparse symmetric matrix into \c NRMat<T> object
 	explicit NRMat(const SparseSMat<T> &rhs);
+	//! explicit constructor converting sparse CSR matrix into \c NRMat<T> object
+        explicit NRMat(const CSRMat<T> &rhs);

 	//! add up given sparse matrix
 	NRMat & operator+=(const SparseMat<T> &rhs);
@ -618,7 +625,6 @@ inline T* NRMat<T>::operator[](const int i) {
 	if (i < 0 || i >= nn) laerror("Mat [] out of range");
 	if (!v) laerror("unallocated matrix");
 #endif
-	NOT_GPU(*this);
 	#ifdef MATPTR
 		return v[i];
 	#else
--- a/matexp.h
+++ b/matexp.h
@ -42,8 +42,8 @@ else
 	for(i=order-1; i>=0; i--)
 		{
 		//std::cerr<<"TEST polynom0 "<<i<<'\n';
-		if(i<order-1) z=y*x;
-		y=z+c[i];
+		if(i<order-1) {LA_traits<T>::deallocate(z); z=y*x;} //for large matrices avoid storing 4 ones simultaneously
+		LA_traits<T>::deallocate(y); y=z+c[i];
 		}
 	}

@ -346,9 +346,11 @@ int power;
 NRVec<typename LA_traits<V>::normtype> taylor2=exp_aux<M,typename LA_traits<V>::normtype>(mat,power,maxpower,maxtaylor,scale);

 V tmp;
+bool washere=0;

 for(int i=1; i<=(1<<power); ++i) //unfortunatelly, here we have to repeat it many times, unlike if the matrix is stored explicitly
 	{
+	washere=1;
 	if(i>1) rhs=result; //apply again to the result of previous application
 	else result=rhs;
 	tmp=rhs; //now rhs can be used as scratch	
@ -361,6 +363,8 @@ for(int i=1; i<=(1<<power); ++i) //unfortunatelly, here we have to repeat it man
 		}
 	}

+if(!washere) laerror("integer overflow due to unrealistically big power - use maxpower argument in exptimes()");
+
 return;
 }

--- a/nonclass.cc
+++ b/nonclass.cc
@ -240,6 +240,41 @@ linear_solve_do(a,&B[0],1,a.nrows(),det,n);
 }


+// Roman, complex version of linear_solve()
+extern "C" void FORNAME(zgesv)(const int *N, const int *NRHS, double *A, const int *LDA,
+           int *IPIV, double *B, const int *LDB, int *INFO);
+
+void linear_solve(NRMat< complex<double> > &A, NRMat< complex<double> > *B, complex<double> *det, int n)
+{
+        //Solve A x^T = B^T for complex matrices with LAPACK zgesv.
+        //A is overwritten by the LU factorization computed by zgesv; B (optional, one right-hand
+        //side per row) is overwritten by the solutions. With B==NULL only the factorization is done.
+        //If det is non-NULL it receives the determinant of A.
+        //The n argument is ignored: the full dimension A.nrows() is always used.
+        int r, *ipiv;
+
+        if (A.nrows() != A.ncols()) laerror("linear_solve() call for non-square matrix");
+        if (B && A.nrows() != B->ncols()) laerror("incompatible matrices in linear_solve()");
+        A.copyonwrite();
+        if (B) B->copyonwrite();
+        ipiv = new int[A.nrows()];
+  n = A.nrows();
+  int nrhs = B ? B->nrows() : 0;
+  int lda = A.ncols();
+  int ldb = B ? B->ncols() : A.nrows();
+        FORNAME(zgesv)(&n, &nrhs, (double *)A[0], &lda, ipiv,
+                 B ? (double *)(*B)[0] : (double *)0, &ldb, &r);
+        if (r < 0) {
+                delete[] ipiv;
+                laerror("illegal argument in lapack_gesv");
+        }
+        if (det && r>=0) {
+                //determinant = product of the diagonal of the U factor ...
+                *det = A[0][0];
+                for (int i=1; i<A.nrows(); ++i) *det *= A[i][i];
+                //... times the parity of the row permutation: zgesv returns 1-based pivot rows,
+                //so each ipiv[i] != i+1 is one row interchange and therefore one sign flip
+                for (int i=0; i<A.nrows(); ++i) if (ipiv[i] != i+1) *det = -(*det);
+        }
+        delete [] ipiv;
+        if (r>0 && B) laerror("singular matrix in zgesv");
+}
+
+
+
 //other version of linear solver based on gesvx

 //------------------------------------------------------------------------------
@ -793,6 +828,18 @@ extern "C" void FORNAME(dggev)(const char *JOBVL, const char *JOBVR, const FINT
 		 double *VL, const FINT *LDVL,  double *VR, const FINT *LDVR,  
 		double *WORK, const FINT *LWORK, FINT *INFO );

+extern "C" void FORNAME(zgeev)(const char *JOBVL, const char *JOBVR, const FINT *N,
+                complex<double> *A, const FINT *LDA, complex<double> *W, complex<double> *VL, const FINT *LDVL,
+                complex<double> *VR, const FINT *LDVR, complex<double> *WORK, const FINT *LWORK,
+		 double *RWORK, FINT *INFO ); //LAPACK zgeev prototype: eigenvalues W and optional left/right eigenvectors VL/VR of a general complex matrix A
+
+extern "C" void FORNAME(zggev)(const char *JOBVL, const char *JOBVR, const FINT *N,
+                complex<double> *A, const FINT *LDA, complex<double> *B, const FINT *LDB, complex<double> *W, complex<double> *WBETA,
+                complex<double> *VL, const FINT *LDVL,  complex<double> *VR, const FINT *LDVR,
+                complex<double> *WORK, const FINT *LWORK, double *RWORK, FINT *INFO ); //LAPACK zggev prototype: generalized problem for the pair (A,B); eigenvalues are returned as the ratios W/WBETA
+
+
+

 //statics for sorting
 static int *gdperm;
@ -904,11 +951,12 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 #endif
 	delete[] work;

-//std::cout <<"TEST dgeev\n"<<wr<<wi<<*vr<<*vl<<std::endl;

 	if (r < 0) laerror("illegal argument in ggev/geev in gdiagonalize()");
 	if (r > 0) laerror("convergence problem in ggev/geev in gdiagonalize()");

+//std::cout <<"TEST dgeev\n"<<wr<<wi<<*vr<<*vl<<std::endl;
+
 	if(biorthonormalize && vl && vr)
 		{
 		if(b || beta) laerror("@@@ biorthonormalize not implemented yet for generalized non-symmetric eigenproblem");//metric b would be needed
@ -968,6 +1016,7 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 			}
 		}

+
 	if(sorttype>0)
 		{
 		NRVec<int> perm(n);
@ -997,12 +1046,119 @@ void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 }


+
+//most general complex routine: eigenvalues w and optionally left (vl) and right (vr) eigenvectors
+//of a general complex matrix a via LAPACK zgeev, or of the generalized problem with metric b via
+//zggev (b and beta must then both be given). n<=0 means n=a.nrows().
+//a (and b) are passed to LAPACK as work arrays and do not keep their input values.
+//biorthonormalize: 1 rescales each vl row, 2 rescales each vr row by 1/(vr_i^H . vl_i);
+//it is not available together with b/beta. sorttype>0 is not implemented here.
+//corder!=0 transposes vl and vr at the end (see the corder note at the declaration in nonclass.h).
+template<>
+void gdiagonalize(NRMat<complex<double> > &a, NRVec< complex<double> > &w,
+		NRMat< complex<double> >*vl, NRMat< complex<double> > *vr,
+		const bool corder, int n, const int sorttype, const int biorthonormalize,
+		NRMat<complex<double> > *b, NRVec<complex<double> > *beta)
+{
+
+	//argument checks
+	if(n<=0) n = a.nrows();
+	if (n > a.ncols() || n>a.nrows() ) laerror("gdiagonalize() call for a non-square matrix");
+	if (n > w.size()) 
+		laerror("inconsistent dimension of eigen vector in gdiagonalize()");
+	if (vl) if (n > vl->nrows() || n > vl->ncols())
+		laerror("inconsistent dimension of vl in gdiagonalize()");
+	if (vr) if (n > vr->nrows() || n > vr->ncols())
+		laerror("inconsistent dimension of vr in gdiagonalize()");
+	if (beta) if(n > beta ->size()) laerror("inconsistent dimension of beta in gdiagonalize()");
+	if(b) if(n > b->nrows() || n > b->ncols())
+		 laerror("inconsistent dimension of b in gdiagonalize()");
+	if(b && !beta || beta && !b) laerror("missing array for generalized diagonalization");
+
+	//everything handed to LAPACK is written to, so detach shared data first
+	a.copyonwrite();
+	w.copyonwrite();
+	if (vl) vl->copyonwrite();
+	if (vr) vr->copyonwrite();
+	if (beta) beta->copyonwrite();
+	if (b) b->copyonwrite();
+	
+	char jobvl = vl ? 'V' : 'N';
+	char jobvr = vr ? 'V' : 'N';
+	complex<double> work0;
+	FINT lwork = -1;
+	FINT r;
+	FINT lda=a.ncols();
+	FINT ldb=0;
+	if(b) ldb=b->ncols();
+	FINT ldvl= vl?vl->ncols():lda;
+	FINT ldvr= vr?vr->ncols():lda;
+
+	//real workspace: 8*n doubles for zggev, 2*n for zgeev
+	double *rwork = new double[n*(b?8:2)];
+
+	//first call with lwork=-1 is a workspace query: the optimal size is returned in work0
+	//NOTE(review): left and right (jobvl/jobvr, vl/vr) are swapped in the calls below, presumably
+	//because Fortran sees the row-major matrix as its transpose - confirm against the real version
+#ifdef FORINT
+        FINT ntmp = n; 
+        if(b)  FORNAME(zggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, w, *beta, vr?vr[0]:(complex<double> *)0,
+                        &ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, &work0, &lwork, rwork, &r);
+	else FORNAME(zgeev)(&jobvr, &jobvl, &ntmp, a, &lda, w, vr?vr[0]:(complex<double> *)0,
+			&ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, &work0, &lwork, rwork, &r);
+#else
+        if(b)  FORNAME(zggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, w, *beta, vr?vr[0]:(complex<double> *)0,
+                        &ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, &work0, &lwork, rwork, &r);
+	else FORNAME(zgeev)(&jobvr, &jobvl, &n, a, &lda, w, vr?vr[0]:(complex<double> *)0,
+			&ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, &work0, &lwork, rwork, &r);
+#endif
+
+	//second call does the actual diagonalization with the workspace size just obtained
+        lwork = (FINT) work0.real();
+	complex<double> *work = new complex<double>[lwork];
+
+#ifdef FORINT
+        if(b)  FORNAME(zggev)(&jobvr, &jobvl, &ntmp, a, &lda, *b, &ldb, w, *beta, vr?vr[0]:(complex<double> *)0,
+                        &ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, work, &lwork, rwork, &r);
+	else FORNAME(zgeev)(&jobvr, &jobvl, &ntmp, a, &lda, w, vr?vr[0]:(complex<double> *)0,
+			&ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, work, &lwork, rwork, &r);
+#else
+        if(b)  FORNAME(zggev)(&jobvr, &jobvl, &n, a, &lda, *b, &ldb, w, *beta, vr?vr[0]:(complex<double> *)0,
+                        &ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, work, &lwork, rwork, &r);
+	else FORNAME(zgeev)(&jobvr, &jobvl, &n, a, &lda, w, vr?vr[0]:(complex<double> *)0,
+			&ldvr, vl?vl[0]:(complex<double> *)0, &ldvl, work, &lwork, rwork, &r);
+#endif
+
+	//free the scratch arrays before any error is raised
+	delete[] work;
+	delete[] rwork;
+
+//std::cout <<"TEST zg(g|e)ev\n"<<w<<*vr<<*vl<<std::endl;
+
+	if (r < 0) laerror("illegal argument in ggev/geev in gdiagonalize()");
+	if (r > 0) laerror("convergence problem in ggev/geev in gdiagonalize()");
+
+	if(biorthonormalize && vl && vr)
+		{
+		if(b || beta) laerror("@@@ biorthonormalize not implemented yet for generalized non-hermitian eigenproblem");//metric b would be needed
+		for(int i=0; i<n; ++i)
+			{
+			//calculate scaling parameter 1/(vr_i^H . vl_i); no diagnostic is printed any more
+			complex<double> tmp;
+			cblas_zdotc_sub(n,(*vr)[i],1,(*vl)[i], 1, &tmp);
+			tmp = 1./tmp;
+			if(biorthonormalize==1) cblas_zscal(n,&tmp,(*vl)[i],1);
+			if(biorthonormalize==2) cblas_zscal(n,&tmp,(*vr)[i],1);
+			}
+		}
+
+	if(sorttype>0)
+		{
+		laerror("sorting not implemented in complex gdiagonalize");
+		}
+
+
+	if (corder) {
+		if (vl) vl->transposeme(n);
+		if (vr) vr->transposeme(n);
+	}
+
+}
+
+
+template<>
 void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
 		NRMat< complex<double> >*vl, NRMat< complex<double> > *vr,
 		const bool corder, int n, const int sorttype, const int biorthonormalize,
 		NRMat<double> *b, NRVec<double> *beta)
 {
-	if(!corder)  laerror("gdiagonalize() corder 0 not implemented");
 	if(n<=0)  n = a.nrows();
 	if(n> a.nrows() || n ==  a.nrows() && n != a.ncols()) laerror("gdiagonalize() call for a non-square matrix");

@ -1020,19 +1176,43 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
 		i = 0;
 		while (i < n) {
 			if (wi[i] == 0) {
+			    if(corder)
+				{
+                                if (vl) for (int j=0; j<n; j++) (*vl)[j][i] = (*rvl)[i][j];
+                                if (vr) for (int j=0; j<n; j++) (*vr)[j][i] = (*rvr)[i][j];
+				}
+			    else
+				{
 				if (vl) for (int j=0; j<n; j++) (*vl)[i][j] = (*rvl)[i][j];
 				if (vr) for (int j=0; j<n; j++) (*vr)[i][j] = (*rvr)[i][j];
+				}
 				i++;
 			} else {
 				if (vl)
 					for (int j=0; j<n; j++) {
+					    if(corder)
+						{
+                                                (*vl)[j][i] = complex<double>((*rvl)[i][j], (*rvl)[i+1][j]);
+                                                (*vl)[j][i+1] = complex<double>((*rvl)[i][j], -(*rvl)[i+1][j]);
+						}
+						else
+						{
 						(*vl)[i][j] = complex<double>((*rvl)[i][j], (*rvl)[i+1][j]);
 						(*vl)[i+1][j] = complex<double>((*rvl)[i][j], -(*rvl)[i+1][j]);
+						}
 					} 
 				if (vr)
 					for (int j=0; j<n; j++) {
+					    if(corder)
+                                                {
+                                                (*vr)[j][i] = complex<double>((*rvr)[i][j], (*rvr)[i+1][j]);
+                                                (*vr)[j][i+1] = complex<double>((*rvr)[i][j], -(*rvr)[i+1][j]);
+                                                }
+                                                else
+                                                {
 						(*vr)[i][j] = complex<double>((*rvr)[i][j], (*rvr)[i+1][j]);
 						(*vr)[i+1][j] = complex<double>((*rvr)[i][j], -(*rvr)[i+1][j]);
+						}
 					}
 				i += 2;
 			}
@ -1043,35 +1223,78 @@ void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
 }


-const NRMat<double> realpart(const NRMat< complex<double> > &a)
+template<>
+const NRMat<double> realpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
 {
+#ifdef CUDALA
+        if(location == cpu){
+#endif
 	NRMat<double> result(a.nrows(), a.ncols());
 	cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0], 2, result, 1);
+#ifdef CUDALA
+        }else{
+	laerror("not implemented for cuda yet");
+	}
+#endif
 	return result;
 }

-const NRMat<double> imagpart(const NRMat< complex<double> > &a)
+template<>
+const NRMat<double> imagpart<NRMat< complex<double> > >(const NRMat< complex<double> > &a)
 {
+#ifdef CUDALA
+        if(location == cpu){
+#endif
+
 	NRMat<double> result(a.nrows(), a.ncols());
 	cblas_dcopy(a.nrows()*a.ncols(), (const double *)a[0]+1, 2, result, 1);
+#ifdef CUDALA
+        }else{
+        laerror("not implemented for cuda yet");
+        }
+#endif
 	return result;
 }

-const NRMat< complex<double> > realmatrix (const NRMat<double> &a)
+template<>
+const NRMat< complex<double> > realmatrix<NRMat<double> > (const NRMat<double> &a)
 {
+#ifdef CUDALA
+        if(location == cpu){
+#endif
+
+
 	NRMat <complex<double> > result(a.nrows(), a.ncols());
 	cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0], 2);
+#ifdef CUDALA
+        }else{
+        laerror("not implemented for cuda yet");
+        }
+#endif
+
 	return result;
 }

-const NRMat< complex<double> > imagmatrix (const NRMat<double> &a)
+template<>
+const NRMat< complex<double> > imagmatrix<NRMat<double> > (const NRMat<double> &a)
 {
+#ifdef CUDALA
+        if(location == cpu){
+#endif
+
 	NRMat< complex<double> > result(a.nrows(), a.ncols());
 	cblas_dcopy(a.nrows()*a.ncols(), a, 1, (double *)result[0]+1, 2);
+#ifdef CUDALA
+        }else{
+        laerror("not implemented for cuda yet");
+        }
+#endif
+
 	return result;
 }

-const NRMat< complex<double> > complexmatrix (const NRMat<double> &re, const NRMat<double> &im)
+template<>
+const NRMat< complex<double> > complexmatrix<NRMat<double> > (const NRMat<double> &re, const NRMat<double> &im)
 {
 	if(re.nrows()!=im.nrows() || re.ncols() != im.ncols()) laerror("incompatible sizes of real and imaginary parts");
        NRMat< complex<double> > result(re.nrows(), re.ncols());
@ -1080,57 +1303,60 @@ const NRMat< complex<double> > complexmatrix (const NRMat<double> &re, const NRM
        return result;
 }

+template<>
+const SparseSMat< complex<double> > complexmatrix<SparseSMat<double> >(const SparseSMat<double> &re, const SparseSMat<double> &im) {
+  if(re.nrows()!=im.nrows() || re.ncols() != im.ncols()) laerror("incompatible sizes of real and imaginary parts");
+  SparseSMat< complex<double> > result(re.nrows(),re.ncols());
+  complex<double> tmp;

+  SparseSMat<double>::iterator pre(re);
+  for(; pre.notend(); ++pre) {
+    tmp = pre->elem;
+    result.add(pre->row,pre->col,tmp,false);
+  }

-NRMat<double> matrixfunction(NRMat<double> a, complex<double>
-		(*f)(const complex<double> &), const bool adjust)
-{
-	int n = a.nrows();
-	NRMat< complex<double> > u(n, n), v(n, n);
-	NRVec< complex<double> > w(n);
-/*
-NRMat<complex<double> > a0=complexify(a);
-*/
-	gdiagonalize(a, w, &u, &v);//a gets destroyed, eigenvectors are rows
-	NRVec< complex<double> > z = diagofproduct(u, v, 1, 1);
-/*
-std::cout <<"TEST matrixfunction\n"<<w<<u<<v<<z;
-std::cout <<"TEST matrixfunction1 "<< u*a0 - diagonalmatrix(w)*u<<std::endl;
-std::cout <<"TEST matrixfunction2 "<< a0*v.transpose(1) - v.transpose(1)*diagonalmatrix(w)<<std::endl;
-std::cout <<"TEST matrixfunction3 "<< u*v.transpose(1)<<diagonalmatrix(z)<<std::endl;
-NRVec< complex<double> > wz(n);
-for (int i=0; i<a.nrows(); i++) wz[i] = w[i]/z[i];
-std::cout <<"TEST matrixfunction4 "<< a0<< v.transpose(true)*diagonalmatrix(wz)*u<<std::endl;
-*/
+  SparseSMat<double>::iterator pim(im);
+  for(; pim.notend(); ++pim) {
+    tmp = complex<double>(0,1)*(pim->elem);
+    result.add(pim->row,pim->col,tmp,false);
+  }

-	for (int i=0; i<a.nrows(); i++) w[i] = (*f)(w[i])/z[i];
-	u.diagmultl(w);
-
-	NRMat< complex<double> > r(n, n);
-	r.gemm(0.0, v, 'c', u, 'n', 1.0);
-	double inorm = cblas_dnrm2(n*n, (double *)r[0]+1, 2);
-	if (inorm > 1e-10) {
-		std::cout << "norm = " << inorm << std::endl;
-		laerror("nonzero norm of imaginary part of real matrixfunction");
-	}
-	return realpart(r);
+  return result;
 }

-NRMat<double> matrixfunction(NRSMat<double> a, double (*f) (double))
-{
-	int n = a.nrows();
-	NRVec<double> w(n);
-	NRMat<double> v(n, n);
-	diagonalize(a, w, &v, 0);
+template<>
+const SparseSMat< complex<double> > realmatrix<SparseSMat<double> >(const SparseSMat<double> &re) {
+  SparseSMat< complex<double> > result(re.nrows(),re.ncols());
+  complex<double> tmp;

-	for (int i=0; i<a.nrows(); i++) w[i] = (*f)(w[i]);
-	NRMat<double> u = v;
-	v.diagmultl(w);
-	NRMat<double> r(n, n);
-	r.gemm(0.0, u, 't', v, 'n', 1.0);
-	return r;
+  SparseSMat<double>::iterator pre(re);
+  for(; pre.notend(); ++pre) {
+    tmp = pre->elem;
+    result.add(pre->row,pre->col,tmp,false);
+  }
+
+  return result;
 }

+template<>
+const SparseSMat< complex<double> > imagmatrix<SparseSMat<double> >(const SparseSMat<double> &im) {
+  SparseSMat< complex<double> > result(im.nrows(),im.ncols());
+  complex<double> tmp;
+
+
+  SparseSMat<double>::iterator pim(im);
+  for(; pim.notend(); ++pim) {
+    tmp = complex<double>(0,1)*(pim->elem);
+    result.add(pim->row,pim->col,tmp,false);
+  }
+
+  return result;
+}
+
+
+
+
+
 NRMat<double> realmatrixfunction(NRMat<double> a, double (*f) (const double))
 {
        int n = a.nrows();
@ -1145,6 +1371,7 @@ NRMat<double> realmatrixfunction(NRMat<double> a, double (*f) (const double))
        return r;
 }

+
 NRMat<complex<double> > complexmatrixfunction(NRMat<double> a, double (*fre) (const double), double (*fim) (const double))
 {
        int n = a.nrows();
@ -1169,6 +1396,16 @@ NRMat<complex<double> > complexmatrixfunction(NRMat<double> a, double (*fre) (co


 // instantize template to an addresable function
+complex<double> myccopy (const complex<double> &x) 
+{
+	return x; //identity function: hands the argument back unchanged (passed to matrixfunction() by copytest())
+}
+
+double mycopy (const double x) 
+{
+	return x; //identity function for real scalars: hands the argument back unchanged
+}
+
 complex<double> myclog (const complex<double> &x) 
 {
 	return log(x);
@ -1193,14 +1430,37 @@ double sqrtinv (const double x)

 NRMat<double>  log(const NRMat<double> &a)
 {
-	return matrixfunction(a, &myclog, 1);
+	return matrixfunction(a, &myclog);
 }

+NRMat<complex<double> >  log(const NRMat<complex<double> > &a)
+{
+        return matrixfunction(a, &myclog); //passes the scalar complex log (myclog) to matrixfunction(); presumably yields the matrix logarithm of a - matrixfunction() is not visible here
+}
+
+
 NRMat<double>  exp0(const NRMat<double> &a)
 {
-        return matrixfunction(a, &mycexp, 1);
+        return matrixfunction(a, &mycexp);
 }

+NRMat<complex<double> >  exp0(const NRMat<complex<double> > &a)
+{
+        return matrixfunction(a, &mycexp); //passes mycexp to matrixfunction(); presumably yields the matrix exponential of a - neither callee is visible here
+}
+
+NRMat<complex<double> >  copytest(const NRMat<complex<double> > &a)
+{
+        return matrixfunction(a, &myccopy); //runs matrixfunction() with the identity function; presumably a self-test that should reproduce a - confirm
+}
+
+NRMat<double>  copytest(const NRMat<double> &a)
+{
+        return matrixfunction(a, &myccopy); //real-matrix variant: runs matrixfunction() with the complex identity function; presumably a self-test that should reproduce a - confirm
+}
+
+
+


 const NRVec<double> diagofproduct(const NRMat<double> &a, const NRMat<double> &b,
--- a/nonclass.h
+++ b/nonclass.h
@ -88,8 +88,8 @@ extern const  NRVec<T> diagofproduct(const NRMat<T> &a, const NRMat<T> &b,\
 extern T trace2(const NRMat<T> &a, const NRMat<T> &b, bool trb=0); \
 extern T trace2(const NRSMat<T> &a, const NRSMat<T> &b, const bool diagscaled=0);\
 extern T trace2(const NRSMat<T> &a, const NRMat<T> &b, const bool diagscaled=0);\
-extern void linear_solve(NRMat<T> &a, NRMat<T> *b, double *det=0,int n=0); /*solve Ax^T=b^T (b is nrhs x n) */ \
-extern void linear_solve(NRSMat<T> &a, NRMat<T> *b, double *det=0, int n=0); /*solve Ax^T=b^T (b is nrhs x n) */\
+extern void linear_solve(NRMat<T> &a, NRMat<T> *b, T *det=0,int n=0); /*solve Ax^T=b^T (b is nrhs x n) */ \
+extern void linear_solve(NRSMat<T> &a, NRMat<T> *b, T *det=0, int n=0); /*solve Ax^T=b^T (b is nrhs x n) */\
 extern void linear_solve(NRMat<T> &a, NRVec<T> &b, double *det=0, int n=0); \
 extern void linear_solve(NRSMat<T> &a, NRVec<T> &b, double *det=0, int n=0); \
 extern void diagonalize(NRMat<T> &a, NRVec<LA_traits<T>::normtype> &w, const bool eivec=1, const bool corder=1, int n=0, NRMat<T> *b=NULL, const int itype=1); \
@ -104,36 +104,28 @@ declare_la(complex<double>)

 // Separate declarations
 //general nonsymmetric matrix and generalized diagonalization
+//corder =0 ... C rows are eigenvectors, =1 ... C columns are eigenvectors
 extern void gdiagonalize(NRMat<double> &a, NRVec<double> &wr, NRVec<double> &wi,
 		NRMat<double> *vl, NRMat<double> *vr, const bool corder=1, int n=0, const int sorttype=0, const int biorthonormalize=0,
-		NRMat<double> *b=NULL, NRVec<double> *beta=NULL);
-extern void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
+		NRMat<double> *b=NULL, NRVec<double> *beta=NULL); //this uses real storage of eigenvectors like dgeev
+
+template<typename T>
+extern void gdiagonalize(NRMat<T> &a, NRVec< complex<double> > &w,
 		 NRMat< complex<double> >*vl, NRMat< complex<double> > *vr,
 		 const bool corder=1, int n=0, const int sorttype=0, const int biorthonormalize=0,
-		NRMat<double> *b=NULL, NRVec<double> *beta=NULL);
-extern NRMat<double> matrixfunction(NRSMat<double> a, double (*f) (double));
-extern NRMat<double> realmatrixfunction(NRMat<double> a, double (*f) (double)); //a has to by in fact symmetric
-extern NRMat<complex<double> > complexmatrixfunction(NRMat<double> a, double (*fre) (double), double (*fim) (double)); //a has to by in fact symmetric
-extern NRMat<double> matrixfunction(NRMat<double> a, complex<double> (*f)(const complex<double> &),const bool adjust=0);
+		NRMat<T> *b=NULL, NRVec<T> *beta=NULL); //eigenvectors are stored in complex matrices for T both double and complex

-extern complex<double> sqrtinv(const complex<double> &);
-extern double sqrtinv(const double);
-
-//functions on matrices
-inline NRMat<double>  sqrt(const NRSMat<double> &a) { return matrixfunction(a,&std::sqrt); }
-inline NRMat<double>  sqrtinv(const NRSMat<double> &a) { return matrixfunction(a,&sqrtinv); }
-inline NRMat<double>  realsqrt(const NRMat<double> &a) { return realmatrixfunction(a,&std::sqrt); }
-inline NRMat<double>  realsqrtinv(const NRMat<double> &a) { return realmatrixfunction(a,&sqrtinv); }
-inline NRMat<double>  log(const NRSMat<double> &a) { return matrixfunction(a,&std::log); }
-extern NRMat<double> log(const NRMat<double> &a);
-extern NRMat<double> exp0(const NRMat<double> &a);
-
-
-extern const NRMat<double> realpart(const NRMat< complex<double> >&);
-extern const NRMat<double> imagpart(const NRMat< complex<double> >&);
-extern const NRMat< complex<double> > realmatrix (const NRMat<double>&);
-extern const NRMat< complex<double> > imagmatrix (const NRMat<double>&);
-extern const NRMat< complex<double> > complexmatrix (const NRMat<double>&, const NRMat<double>&);
+//complex,real,imaginary parts of various entities
+template<typename T>
+extern const typename LA_traits<T>::realtype realpart(const T&);
+template<typename T>
+extern const typename LA_traits<T>::realtype imagpart(const T&);
+template<typename T>
+extern const typename LA_traits<T>::complextype realmatrix (const T&);
+template<typename T>
+extern const typename LA_traits<T>::complextype imagmatrix (const T&);
+template<typename T>
+extern const typename LA_traits<T>::complextype complexmatrix (const T&, const T&);

 //Cholesky decomposition
 extern void cholesky(NRMat<double> &a, bool upper=1);
@ -315,5 +307,84 @@ return r;
 }


+//matrix functions via diagonalization
+
+extern NRMat<double> realmatrixfunction(NRMat<double> a, double (*f) (double)); //a has to be in fact symmetric
+extern NRMat<complex<double> > complexmatrixfunction(NRMat<double> a, double (*fre) (double), double (*fim) (double)); //a has to be in fact symmetric
+
+template<typename T> 
+NRMat<T> matrixfunction(NRSMat<T> a, double (*f) (double)) //of symmetric/hermitian matrix; a is taken by value and f is applied to each entry of w below
+{
+	int n = a.nrows();
+	NRVec<double> w(n); //filled by diagonalize(); presumably the (real) eigenvalues
+	NRMat<T> v(n, n); //filled by diagonalize(); presumably the eigenvectors
+	diagonalize(a, w, &v, 0); //NOTE(review): this NRSMat overload is declared outside this view - confirm the meaning of the last argument
+
+	for (int i=0; i<a.nrows(); i++) w[i] = (*f)(w[i]); //replace every entry of w by f of that entry
+	NRMat<T> u = v; //u holds v as it was before the diagmultl call below
+	NRVec<T> ww=w; //diagmultl needs same type
+	v.diagmultl(ww); //presumably multiplies v by diag(ww) from the left, as the name suggests - confirm in NRMat
+	NRMat<T> r(n, n);
+	r.gemm(0.0, u, 't', v, 'n', 1.0); //gemm will use 'c' for complex ones 
+	return r;
+}
+
+
+template<typename T>
+extern NRMat<T> matrixfunction(NRMat<T> a, complex<double> (*f)(const complex<double> &)) //of a general real/complex matrix
+{
+	int n = a.nrows();
+	NRVec<complex<double> > w(n); //passed to gdiagonalize as w; f is applied to its entries below
+	NRMat<complex<double> > u(n,n),v(n,n); //passed to gdiagonalize as vl and vr respectively
+
+#ifdef debugmf
+NRMat<complex<double> > a0=a; //copy of the input kept for the debug checks, because a gets destroyed by gdiagonalize
+#endif
+
+        gdiagonalize<T>(a, w, &u, &v, false,n,0,false,NULL,NULL);//a gets destroyed, eigenvectors are rows
+        NRVec< complex<double> > z = diagofproduct(u, v, 1, 1); //z[i] is used below as a divisor; presumably it accounts for the normalization of the eigenvectors - confirm
+
+#ifdef debugmf
+std::cout <<"TEST matrixfunction\n"<<w<<u<<v<<z;
+std::cout <<"TEST matrixfunction1 "<< u*a0 - diagonalmatrix(w)*u<<std::endl;
+std::cout <<"TEST matrixfunction2 "<< a0*v.transpose(1) - v.transpose(1)*diagonalmatrix(w)<<std::endl;
+std::cout <<"TEST matrixfunction3 "<< u*v.transpose(1)<<diagonalmatrix(z)<<std::endl;
+#endif
+
+NRVec< complex<double> > wz(n); //NOTE(review): filled unconditionally but only read by the debugmf output below
+for (int i=0; i<a.nrows(); i++) wz[i] = w[i]/z[i];
+
+#ifdef debugmf
+std::cout <<"TEST matrixfunction4 "<< a0<< v.transpose(true)*diagonalmatrix(wz)*u<<std::endl;
+#endif
+
+        for (int i=0; i<a.nrows(); i++) w[i] = (*f)(w[i])/z[i]; //w[i] becomes f(w[i])/z[i]
+        u.diagmultl(w); //presumably scales u by diag(w) from the left, as the name suggests - confirm in NRMat
+
+        NRMat< complex<double> > r(n, n);
+        r.gemm(0.0, v, 'c', u, 'n', 1.0); //'c' applied to v, 'n' to u; same shape of product as the debug expression v.transpose(true)*diagonalmatrix(wz)*u above
+	return (NRMat<T>) r; //convert back to real if applicable by the explicit decomplexifying constructor; it is NOT checked to which accuracy the imaginary part is actually zero
+}
+
+
+
+
+extern complex<double> sqrtinv(const complex<double> &);
+extern double sqrtinv(const double);
+
+//functions on matrices
+inline NRMat<double>  sqrt(const NRSMat<double> &a) { return matrixfunction(a,&std::sqrt); }
+inline NRMat<double>  sqrtinv(const NRSMat<double> &a) { return matrixfunction(a,&sqrtinv); }
+inline NRMat<double>  realsqrt(const NRMat<double> &a) { return realmatrixfunction(a,&std::sqrt); }
+inline NRMat<double>  realsqrtinv(const NRMat<double> &a) { return realmatrixfunction(a,&sqrtinv); }
+inline NRMat<double>  log(const NRSMat<double> &a) { return matrixfunction(a,&std::log); }
+extern NRMat<double> log(const NRMat<double> &a);
+extern NRMat<complex<double> > log(const NRMat<complex<double> > &a);
+extern NRMat<complex<double> > exp0(const NRMat<complex<double> > &a);
+extern NRMat<complex<double> > copytest(const NRMat<complex<double> > &a);
+extern NRMat<double> copytest(const NRMat<double> &a);
+extern NRMat<double> exp0(const NRMat<double> &a);
+
+
 }//namespace
 #endif
--- a/smat.cc
+++ b/smat.cc
@ -58,7 +58,7 @@ void NRSMat<T>::put(int fd, bool dim, bool transp) const {
 		if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
 		if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
 	}
-	LA_traits<T>::multiput(NN2,fd,v,dim);
+	LA_traits<T>::multiput((size_t)nn*(nn+1)/2,fd,v,dim);
 }

 /***************************************************************************//**
@ -89,7 +89,7 @@ void NRSMat<T>::get(int fd, bool dim, bool transp) {
 	}else{
 		copyonwrite();
 	}
-	LA_traits<T>::multiget(NN2,fd,v,dim);
+	LA_traits<T>::multiget((size_t)nn*(nn+1)/2,fd,v,dim);
 }


--- a/smat.h
+++ b/smat.h
@ -159,6 +159,7 @@ public:

 	void clear() {copyonwrite(); LA_traits<T>::clear(v,NN2);}; //zero out
 	void resize(const int n);
+	void dealloc(void) {resize(0);}

 	inline operator T*();
 	inline operator const T*() const;
--- a/sparsemat.cc
+++ b/sparsemat.cc
@ -1245,6 +1245,9 @@ return *this;



+/*
+ Commented out by Roman for ICC
+
 #define INSTANTIZE(T) \
 template SparseMat<T> & SparseMat<T>::oplusequal(const SparseMat<T> &rhs);\
 template SparseMat<T> & SparseMat<T>::oplusequal(const NRMat<T> &rhs);\
@ -1291,9 +1294,8 @@ template void SparseMat<T>::permuteindices(const NRVec<SPMatindex> &p);\


 INSTANTIZE(double)
-
 INSTANTIZE(complex<double>) //some functions are not OK for hermitean matrices, needs a revision!!!
-
+*/

 //////////////////////////////////////////////////////////////////////////////
 //// forced instantization in the corresponding object file
--- a/sparsemat.h
+++ b/sparsemat.h
@ -120,6 +120,7 @@ public:
 	void get(int fd, bool dimensions=1, bool transposed=false);
 	void put(int fd, bool dimensions=1, bool transposed=false) const;
 	void resize(const SPMatindex n, const SPMatindex m); //destructive
+        void dealloc(void) {resize(0,0);}
 	void incsize(const SPMatindex n, const SPMatindex m); //increase size without destroying the data
 	void transposeme();
 	const SparseMat transpose() const;
--- a/sparsesmat.cc
+++ b/sparsesmat.cc
@ -257,6 +257,36 @@ if(divide)
 return divide?NULL:&r[0];
 }

+template <class T>
+SparseSMat<T> SparseSMat<T>::submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const //returns rows fromrow..torow and columns fromcol..tocol (all bounds inclusive) as a new matrix
+{
+#ifdef DEBUG
+        if(fromrow<0 || fromrow>=nn|| torow<0 || torow>=nn || fromcol<0 || fromcol>=mm || tocol<0 || tocol>=mm || fromrow>torow || fromcol>tocol){ //the requested window is validated only in DEBUG builds
+                laerror("invalid submatrix specification");
+        }
+#endif
+        const int m = tocol - fromcol + 1; //+1 because tocol is inclusive
+	const int n = torow - fromrow + 1; //+1 because torow is inclusive
+        SparseSMat<T> result(n, m);
+	typename SparseSMat<T>::iterator p(*this);
+	for(; p.notend(); ++p) //walk the elements of *this and keep those that fall inside the window
+	    if(p->row>=fromrow && p->row<= torow && p->col >= fromcol && p->col <= tocol)
+		result.add(p->row-fromrow, p->col-fromcol, p->elem, false); //indices are shifted so that the window starts at (0,0); NOTE(review): the meaning of the last argument of add() is not visible here
+
+return result;
+}
+
+template <class T>
+void SparseSMat<T>::storesubmatrix(const int fromrow, const int fromcol, const SparseSMat<T> &rhs) //passes every element of rhs to add() on *this, shifted by (fromrow, fromcol); nothing in the target window is cleared here
+{
+        const int tocol = fromcol + rhs.ncols() - 1; //last column touched (inclusive)
+        const int torow = fromrow + rhs.nrows() - 1; //last row touched (inclusive)
+#ifdef DEBUG
+        if(fromrow<0 || fromrow>=nn || torow>=nn || fromcol<0 || fromcol>=mm || tocol>=mm) laerror("bad indices in storesubmatrix"); //the target window is validated only in DEBUG builds
+#endif
+	typename SparseSMat<T>::iterator p(rhs);
+	for(; p.notend(); ++p) add(p->row+fromrow, p->col+fromcol, p->elem, false); //NOTE(review): whether add() accumulates or overwrites, and the meaning of its last argument, are not visible here
+}


 template <class T>
@ -305,6 +335,7 @@ void SparseSMat<T>::put(int fd, bool dimen, bool transp) const {



+/* Commented out by Roman for ICC

 #define INSTANTIZE(T) \
 template void SparseSMat<T>::gemm(const T beta, const SparseSMat &a, const char transa, const SparseSMat &b, const char transb, const T alpha); \
@ -321,8 +352,8 @@ template void SparseSMat<T>::put(int fd, bool dimen, bool transp) const; \


 INSTANTIZE(double)
-
 INSTANTIZE(complex<double>) 
+*/

 //// forced instantization of functions in the header in the corresponding object file
 template class SparseSMat<double>;
--- a/sparsesmat.h
+++ b/sparsesmat.h
@ -61,9 +61,11 @@ public:
 	explicit SparseSMat(const SparseMat<T> &rhs);
 	explicit SparseSMat(const NRSMat<T> &rhs);
 	explicit SparseSMat(const NRMat<T> &rhs);
+	explicit SparseSMat(const CSRMat<T> &rhs);
 	SparseSMat & operator=(const SparseSMat &rhs);
 	void copyonwrite();
        void resize(const SPMatindex nn, const SPMatindex mm);
+   	void dealloc(void) {resize(0,0);}
 	inline void setcoldim(int i) {mm=(SPMatindex)i;};
 	//std::map<SPMatindex,T> *line(SPMatindex n) const {return v[n];};
 	typedef std::map<SPMatindex,T> *ROWTYPE;
@ -100,6 +102,8 @@ public:
 	int nrows() const {return nn;}
 	int ncols() const {return mm;}
 	SparseSMat<T>  cholesky(void) const;
+	SparseSMat submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const;
+	void storesubmatrix(const int fromrow, const int fromcol, const SparseSMat &rhs);

 	class iterator {//not efficient, just for output to ostreams
        private:
--- a/t.cc
+++ b/t.cc
@ -62,9 +62,10 @@ NRVec<double> x(1.,10);
 NRVec<double> y(2.,10);
 NRVec<double> z(-2.,10);

-cout.setf(ios::scientific);
-//cc:cout.setf(ios::fixed);
-cout.precision(12);
+//cout.setf(ios::scientific);
+cc:cout.setf(ios::fixed);
+cout.precision(10);
+cin.exceptions ( ifstream::eofbit | ifstream::failbit | ifstream::badbit );


 if(0) test(x);
@ -415,6 +416,50 @@ cout <<v.transpose(1)*u;
 }


+if(0)
+{
+NRMat<double> a;
+cin >>a;
+int n=a.nrows();
+NRMat<complex<double> > u(n,n),v(n,n);
+NRVec<complex<double> >w(n);
+gdiagonalize(a,w,&u,&v,0,n,0,1);
+cout <<u;
+cout <<w;
+cout <<v;
+
+NRVec<complex<double> >z=diagofproduct(u,v,1,1);
+cout <<z;
+for(int i=0;i<a.nrows();++i) w[i]/=z[i];//account for normalization of eigenvectors
+cout <<u*v.transpose(1); //check biorthonormality
+u.diagmultl(w);
+cout <<v.transpose(1)*u;
+
+}
+
+
+if(1)
+{
+NRMat<complex<double> > a;
+cin >>a;
+int n=a.nrows();
+NRMat<complex<double> > u(n,n),v(n,n);
+NRVec<complex<double> >w(n);
+gdiagonalize(a,w,&u,&v,0,n,0,1);
+cout <<u;
+cout <<w;
+cout <<v;
+
+NRVec<complex<double> >z=diagofproduct(u,v,1,1);
+cout <<z;
+for(int i=0;i<a.nrows();++i) w[i]/=z[i];//account for normalization of eigenvectors
+cout <<u*v.transpose(1); //check biorthonormality
+u.diagmultl(w);
+cout <<v.transpose(1)*u;
+
+}
+
+

 if(0)
 {
@ -773,8 +818,42 @@ for(int i=1; i<4;i++) b=b*b;

 if(0)
 {
-NRMat<double> a;
+NRMat<double> aa,bb,cc;
+cin >>aa;
+cc=copytest(aa);
+cout <<cc;
+
+NRMat<complex<double> > a,b,c;
+a=complexify(aa);
+c=copytest(a);
+cout <<c;
+b=log(a);
+cout <<b;
+cout <<exp(b);
+}
+
+if(0)
+{
+NRMat<complex<double> > a,b,c;
+cin>>a;
+c=copytest(a);
+cout <<c;
+b=log(a);
+cout <<b;
+cout <<exp(b);
+}
+
+if(0)
+{
+NRMat<double> a,b,c;
 cin >>a;
+c=copytest(a);
+cout <<c;
+}
+
+if(0)
+{
+NRMat<double> a;
 NRMat<double> b=exp(a);
 NRMat<double> c=log(b);
 cout <<a;
@ -1163,6 +1242,7 @@ cout <<a.oplus(b);
 cout <<a.otimes(b);
 }

+
 //test of general square matrix eigenvector derivatives
 if(0)
 {
@ -1236,21 +1316,28 @@ tmp.diagonalof(wrg);
 for(int i=0; i<n; ++i)
        for(int j=0; j<n; ++j)
 		if(i!=j) tmp(i,j) /= (wr[j] - wr[i]); else  tmp(i,j) = 0.;
-cout <<" old X matrix \n"<<tmp<<endl;
+cout <<" old X matrix (tmp) \n"<<tmp<<endl;

 NRMat<double> Y = tmp;
 NRMat<double> S = vr.transpose() * vr;
+cout <<"test S\n"<<S;
 NRMat<double> tmp2 = S * tmp;
+cout <<"test tmp2\n"<<tmp2;
+Y.copyonwrite();
 for(int i=0; i<n; ++i) Y(i,i) -= tmp2(i,i);

 cout <<"Y matrix \n"<< Y;

-NRMat<double> numX = inverse(vr) * vrd;
+NRMat<double> vri = inverse(vr);
+
+NRMat<double> numX = vri * vrd;
 cout <<" numerical X matrix \n"<< numX;
 cout <<" numerical X matrix test = "<< (vr * numX - vrd).norm()<<endl;

 vrg = vr * Y;

+vlg = - (Y*vri).transpose();
+
 //and compare
 cout <<"eigenvalue numerical derivative\n"<<wrd<<endl;
 cout <<"eigenvalue analytic derivative\n"<<wrg<<endl;
@ -1261,9 +1348,17 @@ cout <<"right eigenvector  numerical derivative\n"<<vrd<<endl;
 cout <<"right eigenvector  analytic derivative\n"<<vrg<<endl;
 cout <<"right eigenvector derivative error = "<<(vrd-vrg).norm()<<endl;

+//and for left eigenvectors
+cout <<"left eigenvector  numerical derivative\n"<<vld<<endl;
+cout <<"left eigenvector  analytic derivative\n"<<vlg<<endl;
+cout <<"left eigenvector derivative error = "<<(vld-vlg).norm()<<endl;
+

 }

+//@@@@@@@make this derivative check in complex version
+
+
 if(0)
 {
 try { laerror("test catch exception"); }
@ -1415,7 +1510,9 @@ cout <<"symmetry error "<<(het-he).norm()<<endl;

 if(0)
 {
-NRSMat<double> hd(100);
+int n;
+cin >>n;
+NRSMat<double> hd(n);
 hd.randomize(1);
 SparseSMat<double> h(hd);
 NRMat<double> rd = hd*hd;
@ -1425,14 +1522,22 @@ NRMat<double> r2(rx);
 cout <<"Error = "<<(r2-rd).norm()<<endl;
 }

+
 if(0)
 {
-SparseSMat<complex<double> > h;
-cin>>h;
-h *= complex<double>(0,1);
+SparseSMat<double> h0;
+cin>>h0;
+cout <<"matrix read\n"; cout.flush();
+SparseSMat<double> h1 = h0; //.submatrix(0,2047,0,2047);
+SparseSMat<complex<double> > h = imagmatrix(h1);
 double t=clock()/((double) (CLOCKS_PER_SEC));
-SparseSMat<complex<double> > r = exp(h);
-cout <<"SparseSMat time "<<clock()/((double) (CLOCKS_PER_SEC))-t <<"\n";
+SparseSMat<complex<double> > r = h*h;
+cout <<"SparseSMat mult time "<<clock()/((double) (CLOCKS_PER_SEC))-t <<"\n";
+cout.flush();
+t=clock()/((double) (CLOCKS_PER_SEC));
+r = exp(h);
+cout <<"SparseSMat exp time "<<clock()/((double) (CLOCKS_PER_SEC))-t <<"\n";
+cout.flush();
 if(h.nrows()<=1024)
 {
 NRSMat<complex<double> > h3(h);
@ -1443,6 +1548,7 @@ cout <<"errorx = "<<(r2-NRSMat<complex<double> >(r)).norm()<<endl;
 }
 }

+
 if(0)
 {
 int n;
@ -1598,7 +1704,7 @@ cgpu.moveto(cpu);
 cout << "Error = "<<(c-cgpu).norm()<<endl;
 }

-if(1)
+if(0)
 {
 int n;
 cin >>n;
@ -1627,6 +1733,31 @@ c.moveto(gpu1);
 cout << "Error = "<<(c-cgpu).norm()<<endl;
 }

+/*
+if(0)
+{
+CSRMat<double> h0;
+cin>>h0;
+cout <<"matrix read\n"; cout.flush();
+CSRMat<double> h1 = h0; 
+CSRMat<complex<double> > h = imagmatrix(h1);
+double t=clock()/((double) (CLOCKS_PER_SEC));
+CSRMat<complex<double> > r = h*h;
+cout <<"CSRMat mult time "<<clock()/((double) (CLOCKS_PER_SEC))-t <<"\n";
+cout.flush();
+t=clock()/((double) (CLOCKS_PER_SEC));
+r = exp(h);
+cout <<"CSRMat exp time "<<clock()/((double) (CLOCKS_PER_SEC))-t <<"\n";
+cout.flush();
+if(h.nrows()<=1024)
+{
+NRMat<complex<double> > h2(h);
+NRMat<complex<double> >r2 = exp(h2);
+cout <<"error = "<<(r2-NRMat<complex<double> >(r)).norm()<<endl;
+}
+}
+*/
+



--- a/vec.cc
+++ b/vec.cc
@ -793,14 +793,14 @@ NRVec<complex<double> >::otimes(const NRVec<complex<double> > &b, const bool con
 		if(conj){
 			const cuDoubleComplex alpha = make_cuDoubleComplex(scale.real(), -scale.imag());

-			cublasZgerc(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result[0]), 1);
+			cublasZgerc(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result[0]), b.nn);
 			TEST_CUBLAS("cublasZgerc");

 			result.conjugateme();
 		}else{
 			const cuDoubleComplex alpha = make_cuDoubleComplex(scale.real(), +scale.imag());

-			cublasZgeru(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result[0]), 1);
+			cublasZgeru(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result[0]), b.nn);
 			TEST_CUBLAS("cublasZgeru");
 		}
 	}
@ -839,6 +839,9 @@ NRVec<complex<double> > complexify(const NRVec<double> &rhs) {
 /***************************************************************************//**
 * forced instantization in the corespoding object file
 ******************************************************************************/
+/*
+ Commented out by Roman for ICC
+
 #define INSTANTIZE(T) \
 template void NRVec<T>::put(int fd, bool dim, bool transp) const; \
 template void NRVec<T>::get(int fd, bool dim, bool transp); \
@ -855,19 +858,7 @@ INSTANTIZE(unsigned short)
 INSTANTIZE(unsigned int)
 INSTANTIZE(unsigned long)
 INSTANTIZE(unsigned long long)
-
-template class NRVec<double>;
-template class NRVec<complex<double> >;
-template class NRVec<char>;
-template class NRVec<short>;
-template class NRVec<int>;
-template class NRVec<long>;
-template class NRVec<long long>;
-template class NRVec<unsigned char>;
-template class NRVec<unsigned short>;
-template class NRVec<unsigned int>;
-template class NRVec<unsigned long>;
-template class NRVec<unsigned long long>;
+*/

 #define INSTANTIZE_DUMMY(T) \
 template<> void NRVec<T>::gemv(const T beta, const NRMat<T> &a, const char trans,  const T alpha, const NRVec<T> &x) { laerror("gemv on unsupported types"); } \
@ -878,6 +869,11 @@ template<> void NRVec<T>::gemv(const  LA_traits_complex<T>::Component_type beta,
 template<> NRVec<T> & NRVec<T>::normalize(LA_traits<T>::normtype *) {laerror("normalize() impossible for integer types"); return *this;} \
 template<> const NRMat<T> NRVec<T>::otimes(const NRVec<T> &b,const bool conj, const T &scale) const {laerror("otimes presently implemented only for double and complex double"); return NRMat<T> ();}

+// Roman
+// the following SparseMat gemv specializations are not implemented: they only call laerror()
+template<> void NRVec<double>::gemv(const double beta, const SparseMat<double> &a, const char trans,  const double alpha, const NRVec<double> &x, bool s) { laerror("gemv on unsupported types"); } 
+template<> void NRVec< complex<double> >::gemv(const complex<double> beta, const SparseMat< complex<double> > &a, const char trans,  const complex<double> alpha, const NRVec< complex<double> > &x, bool s) { laerror("gemv on unsupported types"); } 
+

 INSTANTIZE_DUMMY(char)
 INSTANTIZE_DUMMY(short)
@ -902,4 +898,17 @@ INSTANTIZE_DUMMY(complex<unsigned long long>)
 INSTANTIZE_DUMMY(complex<complex<double> >)
 INSTANTIZE_DUMMY(complex<complex<float> >)

+template class NRVec<double>;
+template class NRVec<complex<double> >;
+template class NRVec<char>;
+template class NRVec<short>;
+template class NRVec<int>;
+template class NRVec<long>;
+template class NRVec<long long>;
+template class NRVec<unsigned char>;
+template class NRVec<unsigned short>;
+template class NRVec<unsigned int>;
+template class NRVec<unsigned long>;
+template class NRVec<unsigned long long>;
+
 }//namespace
--- a/vec.h
+++ b/vec.h
@ -287,6 +287,9 @@ public:
 	//! resize the current vector
 	void resize(const int n);

+	//!deallocate the current vector
+	void dealloc(void) {resize(0);}
+
 	//! determine the norm of this vector 
 	inline const typename LA_traits<T>::normtype norm() const;