diff --git a/fourindex.h b/fourindex.h
index 3363a5a..f80359a 100644
--- a/fourindex.h
+++ b/fourindex.h
@@ -2,6 +2,7 @@
 #define _fourindex_included
 
 //element of a linked list, indices in a portable way, no bit shifts and endianity problems any more!
+//note: nn is never compared with individual indices, so indexing from 1 as well as from 0 is possible
 
 
 template<class I, class T>
@@ -252,7 +253,7 @@ istream& operator>>(istream  &s, fourindex<I,T> &x)
                         {
 			s>>elem;
 			x.add(i,j,k,l,elem);
-			s >> i >> j >>k >>ll;
+			s >> i >> j >>k >>l;
                         }
                 return s;
                 }
diff --git a/la_traits.h b/la_traits.h
index a14f487..6191724 100644
--- a/la_traits.h
+++ b/la_traits.h
@@ -87,8 +87,8 @@ static bool bigger(const  complex<C> &x, const complex<C> &y) {laerror("complex
 static bool smaller(const  complex<C> &x, const complex<C> &y) {laerror("complex comparison undefined"); return false;}
 static inline normtype norm (const  complex<C> &x) {return abs(x);}
 static inline void axpy (complex<C> &s, const complex<C> &x, const complex<C> &c) {s+=x*c;}
-static void get(int fd, complex<C> &x, bool dimensions=0) {if(sizeof(complex<C>)!=read(fd,&x,sizeof(complex<C>))) laerror("read error");}
-static void put(int fd, const complex<C> &x, bool dimensions=0) {if(sizeof(complex<C>)!=write(fd,&x,sizeof(complex<C>))) laerror("write error");}
+static inline void get(int fd, complex<C> &x, bool dimensions=0, bool transp=0) {if(sizeof(complex<C>)!=read(fd,&x,sizeof(complex<C>))) laerror("read error");}
+static inline void put(int fd, const complex<C> &x, bool dimensions=0, bool transp=0) {if(sizeof(complex<C>)!=write(fd,&x,sizeof(complex<C>))) laerror("write error");}
 static void multiget(unsigned int n,int fd, complex<C> *x, bool dimensions=0){if((ssize_t)(n*sizeof(complex<C>))!=read(fd,x,n*sizeof(complex<C>))) laerror("read error");}
 static void multiput(unsigned int n, int fd, const complex<C> *x, bool dimensions=0) {if((ssize_t)(n*sizeof(complex<C>))!=write(fd,x,n*sizeof(complex<C>))) laerror("write error");}
 
@@ -105,8 +105,8 @@ static inline bool bigger(const  C &x, const C &y) {return x>y;}
 static inline bool smaller(const  C &x, const C &y) {return x<y;}
 static inline normtype norm (const  C &x) {return abs(x);}
 static inline void axpy (C &s, const C &x, const C &c) {s+=x*c;}
-static void put(int fd, const C &x, bool dimensions=0) {if(sizeof(C)!=write(fd,&x,sizeof(C))) laerror("write error");}
-static void get(int fd, C &x, bool dimensions=0) {if(sizeof(C)!=read(fd,&x,sizeof(C))) laerror("read error");}
+static inline void put(int fd, const C &x, bool dimensions=0, bool transp=0) {if(sizeof(C)!=write(fd,&x,sizeof(C))) laerror("write error");}
+static inline void get(int fd, C &x, bool dimensions=0, bool transp=0) {if(sizeof(C)!=read(fd,&x,sizeof(C))) laerror("read error");}
 static void multiput(unsigned int n,int fd, const C *x, bool dimensions=0){if((ssize_t)(n*sizeof(C))!=write(fd,x,n*sizeof(C))) laerror("write error");}
 static void multiget(unsigned int n, int fd, C *x, bool dimensions=0) {if((ssize_t)(n*sizeof(C))!=read(fd,x,n*sizeof(C))) laerror("read error");}
 };
@@ -128,8 +128,8 @@ static inline bool bigger(const  C &x, const C &y) {return x>y;} \
 static inline bool smaller(const  C &x, const C &y) {return x<y;} \
 static inline normtype norm (const X<C> &x) {return x.norm();} \
 static inline void axpy (X<C>&s, const X<C> &x, const C c) {s.axpy(c,x);} \
-static void put(int fd, const C &x, bool dimensions=1) {x.put(fd,dimensions);} \
-static void get(int fd, C &x, bool dimensions=1) {x.get(fd,dimensions);} \
+static void put(int fd, const C &x, bool dimensions=1, bool transp=0) {x.put(fd,dimensions,transp);} \
+static void get(int fd, C &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions,transp);} \
 static void multiput(unsigned int n,int fd, const C *x, bool dimensions=1) {for(unsigned int i=0; i<n; ++i) x[i].put(fd,dimensions);} \
 static void multiget(unsigned int n,int fd, C *x, bool dimensions=1) {for(unsigned int i=0; i<n; ++i) x[i].get(fd,dimensions);} \
 };
@@ -153,8 +153,8 @@ static inline bool bigger(const  C &x, const C &y) {return x>y;}
 static inline bool smaller(const  C &x, const C &y) {return x<y;}
 static inline normtype norm (const NRSMat<C> &x) {return x.norm();} 
 static inline void axpy (NRSMat<C>&s, const NRSMat<C> &x, const C c) {s.axpy(c,x);} 
-static void put(int fd, const C &x, bool dimensions=1) {x.put(fd,dimensions);} 
-static void get(int fd, C &x, bool dimensions=1) {x.get(fd,dimensions);} 
+static void put(int fd, const C &x, bool dimensions=1, bool transp=0) {x.put(fd,dimensions);} 
+static void get(int fd, C &x, bool dimensions=1, bool transp=0) {x.get(fd,dimensions);} 
 static void multiput(unsigned int n,int fd, const C *x, bool dimensions=1) {for(unsigned int i=0; i<n; ++i) x[i].put(fd,dimensions);} \
 static void multiget(unsigned int n,int fd, C *x, bool dimensions=1) {for(unsigned int i=0; i<n; ++i) x[i].get(fd,dimensions);} \
 };
diff --git a/mat.cc b/mat.cc
index ade9c70..fa6d3e8 100644
--- a/mat.cc
+++ b/mat.cc
@@ -25,15 +25,28 @@ template NRMat<char>;
 
 //raw I/O
 template <typename T>
-void NRMat<T>::put(int fd, bool dim) const
+void NRMat<T>::put(int fd, bool dim, bool transp) const
 {
 errno=0;
 if(dim)
 {
-if(sizeof(int) != write(fd,&nn,sizeof(int))) laerror("cannot write");
-if(sizeof(int) != write(fd,&mm,sizeof(int))) laerror("cannot write");
+if(sizeof(int) != write(fd,&(transp?mm:nn),sizeof(int))) laerror("cannot write");
+if(sizeof(int) != write(fd,&(transp?nn:mm),sizeof(int))) laerror("cannot write");
 }
-LA_traits<T>::multiput(nn*mm,fd,
+
+if(transp)  //not particularly efficient
+        {
+        for(int j=0; j<mm; ++j)
+                for(int i=0; i<nn; ++i)
+                        LA_traits<T>::put(fd, 
+#ifdef MATPTR
+                                v[i][j]
+#else
+                                v[i*mm+j]
+#endif
+                        ,dim,transp); 
+        }
+else LA_traits<T>::multiput(nn*mm,fd,
 #ifdef MATPTR
         v[0]
 #else
@@ -43,7 +56,7 @@ LA_traits<T>::multiput(nn*mm,fd,
 }
 
 template <typename T>
-void NRMat<T>::get(int fd, bool dim)
+void NRMat<T>::get(int fd, bool dim, bool transp)
 {
 int nn0,mm0;
 errno=0;
@@ -51,11 +64,23 @@ if(dim)
 {
 if(sizeof(int) != read(fd,&nn0,sizeof(int))) laerror("cannot read");
 if(sizeof(int) != read(fd,&mm0,sizeof(int))) laerror("cannot read");
-resize(nn0,mm0);
+if(transp) resize(mm0,nn0); else resize(nn0,mm0);
 }
 else
 copyonwrite();
-LA_traits<T>::multiget(nn*mm,fd,
+if(transp)  //not particularly efficient
+	{
+	for(int j=0; j<mm; ++j)
+		for(int i=0; i<nn; ++i)
+			LA_traits<T>::get(fd,
+#ifdef MATPTR
+				v[i][j]
+#else
+				v[i*mm+j]
+#endif
+			,dim,transp);
+	}
+else LA_traits<T>::multiget(nn*mm,fd,
 #ifdef MATPTR
         v[0]
 #else
@@ -184,6 +209,24 @@ const NRVec<T> NRMat<T>::rsum() const
 	return result;
 }
 
+//block submatrix
+template <typename T>
+const NRMat<T> NRMat<T>::submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const
+{
+#ifdef DEBUG
+if(fromrow <0 ||fromrow >=nn||torow <0 ||torow >=nn ||fromcol<0||fromcol>=mm||tocol<0||tocol>=mm||fromrow>torow||fromcol>tocol) laerror("bad indices in submatrix");
+#endif
+int n=torow-fromrow+1;
+int m=tocol-fromcol+1;
+NRMat<T> r(n,m);
+for(int i=fromrow; i<=torow; ++i)
+#ifdef MATPTR
+	memcpy(r.v[i-fromrow],v[i]+fromcol,m*sizeof(T));
+#else
+	memcpy(r.v+(i-fromrow)*m,v+i*mm+fromcol,m*sizeof(T));
+#endif
+return r;
+}
 
 
 // transpose Mat
diff --git a/mat.h b/mat.h
index f0f6e23..88a798d 100644
--- a/mat.h
+++ b/mat.h
@@ -73,8 +73,8 @@ public:
 	inline int nrows() const;
 	inline int ncols() const;
 	inline int size() const;
-        void get(int fd, bool dimensions=1);
-        void put(int fd, bool dimensions=1) const;
+        void get(int fd, bool dimensions=1, bool transposed=false);
+        void put(int fd, bool dimensions=1, bool transposed=false) const;
 	void copyonwrite();
 	void resize(const int n, const int m);
 	inline operator T*(); //get a pointer to the data
@@ -83,6 +83,7 @@ public:
 	NRMat & conjugateme(); // square matrices only
 	const NRMat transpose(bool conj=false) const;
 	const NRMat conjugate() const;
+        const NRMat submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const; //there is also independent less efficient routine for generally indexed submatrix
 	void gemm(const T &beta, const NRMat &a, const char transa, const NRMat &b,
 			const char transb, const T &alpha);//this = alpha*op( A )*op( B ) + beta*this
 /*
diff --git a/nonclass.cc b/nonclass.cc
index caad05a..18cd396 100644
--- a/nonclass.cc
+++ b/nonclass.cc
@@ -134,7 +134,7 @@ static void linear_solve_do(NRMat<double> &A, double *B, const int nrhs, const i
 		//take into account some numerical instabilities in dgesv for singular matrices
 		for (int i=0; i<n; ++i)  {double t=A[i][i]; if(!finite(t) || abs(t) < EPSDET ) {*det=0.; break;} else *det *=t;}
 		//change sign of det by parity of ipiv permutation
-		if(*det) for (int i=0; i<n; ++i) *det = -(*det);
+		if(*det) for (int i=0; i<n; ++i) if(i!=ipiv[i]) *det = -(*det);
 	}
 	if(det && r>0) *det = 0;
 	delete [] ipiv;
@@ -643,6 +643,21 @@ NRMat<double> matrixfunction(NRSMat<double> a, double (*f) (double))
 	return r;
 }
 
+NRMat<double> realmatrixfunction(NRMat<double> a, double (*f) (double))
+{
+        int n = a.nrows();
+        NRVec<double> w(n);
+        diagonalize(a, w, true, false);
+
+        for (int i=0; i<a.nrows(); i++) w[i] = (*f)(w[i]);
+        NRMat<double> u = a;
+        a.diagmultl(w);
+        NRMat<double> r(n, n);
+        r.gemm(0.0, u, 't', a, 'n', 1.0);
+        return r;
+}
+
+
 // instantize template to an addresable function
 complex<double> myclog (const complex<double> &x) 
 {
diff --git a/nonclass.h b/nonclass.h
index 5235fb8..abb9602 100644
--- a/nonclass.h
+++ b/nonclass.h
@@ -73,10 +73,12 @@ extern void gdiagonalize(NRMat<double> &a, NRVec< complex<double> > &w,
 		 const bool corder=1, int n=0, const int sorttype=0, const bool biorthonormalize=0,
 		NRMat<double> *b=NULL, NRVec<double> *beta=NULL);
 extern NRMat<double> matrixfunction(NRSMat<double> a, double (*f) (double));
+extern NRMat<double> realmatrixfunction(NRMat<double> a, double (*f) (double)); //a has to by in fact symmetric
 extern NRMat<double> matrixfunction(NRMat<double> a, complex<double> (*f)(const complex<double> &),const bool adjust=0);
 
 //functions on matrices
 inline NRMat<double>  sqrt(const NRSMat<double> &a) { return matrixfunction(a,&sqrt); }
+inline NRMat<double>  realsqrt(const NRMat<double> &a) { return realmatrixfunction(a,&sqrt); }
 inline NRMat<double>  log(const NRSMat<double> &a) { return matrixfunction(a,&log); }
 extern NRMat<double> log(const NRMat<double> &a);
 
@@ -119,6 +121,7 @@ linear_solve(a,NULL,&det);
 return det;
 }
 
+
 //general submatrix, INDEX will typically be NRVec<int> or even int*
 //NOTE: in order to check consistency between nrows and rows in rows is a NRVec
 //some advanced metaprogramming would be necessary
diff --git a/qsort.h b/qsort.h
index 15b0d79..8051b2a 100644
--- a/qsort.h
+++ b/qsort.h
@@ -1,3 +1,5 @@
+#ifndef _QSORT_H
+#define _QSORT_H
 //general quicksort suitable if we do not want to create extra class for the member type but prefer to encapsulate it into compare and swap soutines
 //returns parity of the permutation
 
@@ -39,3 +41,5 @@ else
 	{if(i<r) parity ^=genqsort(i,r,cmp,swap); if(l<j) parity ^=genqsort(l,j,cmp,swap);}
 return parity;
 }
+
+#endif
diff --git a/smat.cc b/smat.cc
index deccb63..fe13588 100644
--- a/smat.cc
+++ b/smat.cc
@@ -28,7 +28,7 @@ template NRSMat<char>;
 
 //raw I/O
 template <typename T>
-void NRSMat<T>::put(int fd, bool dim) const
+void NRSMat<T>::put(int fd, bool dim, bool transp) const
 {
 errno=0;
 if(dim)
@@ -40,7 +40,7 @@ LA_traits<T>::multiput(NN2,fd,v,dim);
 }
 
 template <typename T>
-void NRSMat<T>::get(int fd, bool dim)
+void NRSMat<T>::get(int fd, bool dim, bool transp)
 {
 int nn0[2]; //align at least 8-byte
 errno=0;
diff --git a/smat.h b/smat.h
index 2f09ab5..ffd0708 100644
--- a/smat.h
+++ b/smat.h
@@ -56,8 +56,8 @@ public:
 	void axpy(const T alpha, const NRSMat &x); // this+= a*x
 	inline const T amax() const;
 	const T trace() const;
-	void get(int fd, bool dimensions=1);
-        void put(int fd, bool dimensions=1) const;
+	void get(int fd, bool dimensions=1, bool transp=0);
+        void put(int fd, bool dimensions=1, bool transp=0) const;
 	void copyonwrite();
 	void resize(const int n);
 	inline operator T*(); //get a pointer to the data
diff --git a/sparsemat.cc b/sparsemat.cc
index c584fe7..6ed5d6f 100644
--- a/sparsemat.cc
+++ b/sparsemat.cc
@@ -60,14 +60,14 @@ extern ssize_t write(int, const void *, size_t);
 
 
 export template <class T>
-void SparseMat<T>::get(int fd, bool dimen)
+void SparseMat<T>::get(int fd, bool dimen, bool transp)
 {
 errno=0;
 SPMatindex dim[2];
 if(dimen)
 {
 if(2*sizeof(SPMatindex) != read(fd,&dim,2*sizeof(SPMatindex))) laerror("cannot read");
-resize(dim[0],dim[1]);
+if(transp) resize(dim[1],dim[0]); else resize(dim[0],dim[1]);
 int symnon[2];
 if(2*sizeof(int) != read(fd,&symnon,2*sizeof(int))) laerror("cannot read");
 symmetric=symnon[0];
@@ -84,22 +84,21 @@ do
           matel<T> *ll = l;
           l= new matel<T>;
           l->next= ll;
-          l->row=dim[0];
-          l->col=dim[1];
-	  LA_traits<T>::get(fd,l->elem,dimen); //general way to work when elem is some complex class again
+          if(transp) {l->row=dim[1]; l->col=dim[0];} else {l->row=dim[0]; l->col=dim[1];}
+	  LA_traits<T>::get(fd,l->elem,dimen,transp); //general way to work when elem is some complex class again
           } while(1);
 list=l;
 }
 
 
 export template <class T>
-void SparseMat<T>::put(int fd,bool dimen) const
+void SparseMat<T>::put(int fd,bool dimen, bool transp) const
 {
 errno=0;
 if(dimen)
 {
-if(sizeof(SPMatindex) != write(fd,&nn,sizeof(SPMatindex))) laerror("cannot write");
-if(sizeof(SPMatindex) != write(fd,&mm,sizeof(SPMatindex))) laerror("cannot write");
+if(sizeof(SPMatindex) != write(fd,&(transp ? mm : nn),sizeof(SPMatindex))) laerror("cannot write");
+if(sizeof(SPMatindex) != write(fd,&(transp ? nn : mm),sizeof(SPMatindex))) laerror("cannot write");
 int symnon[2];
 symnon[0]=symmetric;
 symnon[1]=nonzero;
@@ -108,9 +107,9 @@ if(2*sizeof(int) != write(fd,symnon,2*sizeof(int))) laerror("cannot write");
 matel<T> *l=list;
 while(l)
 	{
-	if(sizeof(SPMatindex) != write(fd,&l->row,sizeof(SPMatindex))) laerror("cannot write");
-	if(sizeof(SPMatindex) != write(fd,&l->col,sizeof(SPMatindex))) laerror("cannot write");
-	LA_traits<T>::put(fd,l->elem,dimen);//general way to work when elem is some non-scalar class again
+	if(sizeof(SPMatindex) != write(fd,&(transp ? l->col : l->row),sizeof(SPMatindex))) laerror("cannot write");
+	if(sizeof(SPMatindex) != write(fd,&(transp ? l->row : l->col),sizeof(SPMatindex))) laerror("cannot write");
+	LA_traits<T>::put(fd,l->elem,dimen,transp);//general way to work when elem is some non-scalar class again
 	l=l->next;
 	}
 SPMatindex sentinel[2];
@@ -1082,6 +1081,54 @@ return result;
 }
 
 
+template<class T>
+void SparseMat<T>::permuterows(const NRVec<SPMatindex> &p)
+{
+if((SPMatindex)p.size()!=nn) laerror("inconsistent dimension in permuterows");
+
+matel<T> *l=list;
+
+while(l)
+	{
+	l->row = p[l->row];
+	if(symmetric) l->col= p[l->col];
+	l=l->next;
+	}
+}
+
+
+template<class T>
+void SparseMat<T>::permutecolumns(const NRVec<SPMatindex> &p)
+{
+if((SPMatindex)p.size()!=nn) laerror("inconsistent dimension in permuterows");
+
+matel<T> *l=list;
+
+while(l)
+        {
+        if(symmetric) l->row = p[l->row];
+        l->col= p[l->col];
+        l=l->next;
+        }
+}
+
+template<class T>
+void SparseMat<T>::permuteindices(const NRVec<SPMatindex> &p)
+{
+if((SPMatindex)p.size()!=nn) laerror("inconsistent dimension in permuterows");
+
+matel<T> *l=list;
+
+while(l)
+        {
+        l->row = p[l->row];
+        l->col= p[l->col];
+        l=l->next;
+        }
+}
+
+
+
 
 template<class T>
 const SparseMat<T> SparseMat<T>::operator*(const SparseMat<T> &rhs) const
@@ -1177,6 +1224,8 @@ for(i=0; i<na;i++)
 simplify();
 }
 
+
+
 //direct sum and product -- only partly implemented at the moment
 export template<typename T>
 SparseMat<T> & SparseMat<T>::oplusequal(const NRMat<T> &rhs)
@@ -1196,6 +1245,8 @@ for(SPMatindex i=0; i<(SPMatindex)rhs.nrows(); ++i)
 return *this;
 }
 
+
+
 export template<typename T>
 SparseMat<T> & SparseMat<T>::oplusequal(const NRSMat<T> &rhs)
 {
@@ -1214,6 +1265,8 @@ for(SPMatindex i=0; i<(SPMatindex)rhs.nrows(); ++i)
 return *this;
 }
 
+
+
 export template <class T>
 SparseMat<T> & SparseMat<T>::oplusequal(const SparseMat<T> &rhs)
 {
@@ -1245,8 +1298,8 @@ template SparseMat<T> & SparseMat<T>::oplusequal(const NRMat<T> &rhs);\
 template SparseMat<T> & SparseMat<T>::oplusequal(const NRSMat<T> &rhs);\
 template ostream& operator<<(ostream &s, const SparseMat<T> &x); \
 template istream& operator>>(istream &s, SparseMat<T> &x); \
-template void SparseMat<T>::get(int fd, bool dimen); \
-template void SparseMat<T>::put(int fd, bool dimen) const; \
+template void SparseMat<T>::get(int fd, bool dimen, bool transp); \
+template void SparseMat<T>::put(int fd, bool dimen, bool transp) const; \
 template void SparseMat<T>::copyonwrite(); \
 template void SparseMat<T>::unsort(); \
 template void SparseMat<T>::resize(const SPMatindex n, const SPMatindex m); \
@@ -1280,6 +1333,10 @@ template const SparseMat<T> SparseMat<T>::operator*(const SparseMat<T> &rhs) con
 template const T SparseMat<T>::dot(const SparseMat<T> &rhs) const; \
 template void SparseMat<T>::gemm(const T beta, const SparseMat<T> &a, const char transa, const SparseMat<T> &b, const char transb, const T alpha); \
 template void NRVec<T>::gemv(const T beta, const SparseMat<T> &a, const char trans, const T alpha, const NRVec<T> &x);\
+template void SparseMat<T>::permuterows(const NRVec<SPMatindex> &p);\
+template void SparseMat<T>::permutecolumns(const NRVec<SPMatindex> &p);\
+template void SparseMat<T>::permuteindices(const NRVec<SPMatindex> &p);\
+
 
 
 INSTANTIZE(double)
diff --git a/sparsemat.h b/sparsemat.h
index 9f5b259..bc15f5a 100644
--- a/sparsemat.h
+++ b/sparsemat.h
@@ -107,12 +107,15 @@ public:
 	void setlist(matel<T> *l) {list=l;}
 	inline SPMatindex nrows() const {return nn;}
         inline SPMatindex ncols() const {return mm;}
-	void get(int fd, bool dimensions=1);
-	void put(int fd, bool dimensions=1) const;
+	void get(int fd, bool dimensions=1, bool transposed=false);
+	void put(int fd, bool dimensions=1, bool transposed=false) const;
 	void resize(const SPMatindex n, const SPMatindex m); //destructive
 	void incsize(const SPMatindex n, const SPMatindex m); //increase size without destroying the data
 	void transposeme();
 	const SparseMat transpose() const;
+	void permuteindices(const NRVec<SPMatindex> &p);
+	void permuterows(const NRVec<SPMatindex> &p);
+	void permutecolumns(const NRVec<SPMatindex> &p);
 	inline void setsymmetric() {if(nn!=mm) laerror("non-square cannot be symmetric"); symmetric=1;}
 	inline void defineunsymmetric() {symmetric=0;} //just define and do nothing with it
 	void setunsymmetric();//unwind the matrix assuming it was indeed symmetric
diff --git a/t.cc b/t.cc
index 9b9cca0..464cce4 100644
--- a/t.cc
+++ b/t.cc
@@ -512,7 +512,7 @@ cin>>n;
 	cout <<"difference = "<<(res1-res2).norm()<<endl;
 }
 
-if(1)
+if(0)
 {
 int n,k,m;
 cin >>n>>k>>m;
@@ -1095,5 +1095,15 @@ for(int iter=1; iter<100 && norm>1e-8 ; ++iter)
 	}
 }
 
+if(1)
+{
+NRMat<double> a,b;
+cin >>a;
+b=realsqrt(a);
+cout <<b;
+cout <<b*b;
+cout <<(b*b-a).norm();
+}
+
 
 }
diff --git a/vec.cc b/vec.cc
index 5e79a1f..377938c 100644
--- a/vec.cc
+++ b/vec.cc
@@ -15,8 +15,8 @@ extern ssize_t write(int, const void *, size_t);
 #define INSTANTIZE(T) \
 template ostream & operator<<(ostream &s, const NRVec< T > &x); \
 template istream & operator>>(istream  &s, NRVec< T > &x); \
-template void NRVec<T>::put(int fd, bool dim) const; \
-template void NRVec<T>::get(int fd, bool dim); \
+template void NRVec<T>::put(int fd, bool dim, bool transp) const; \
+template void NRVec<T>::get(int fd, bool dim, bool transp); \
 
 
 
@@ -82,7 +82,7 @@ istream & operator>>(istream &s, NRVec<T> &x)
 
 //raw I/O
 template <typename T>
-void NRVec<T>::put(int fd, bool dim) const
+void NRVec<T>::put(int fd, bool dim, bool transp) const
 {
 errno=0;
 int pad=1; //align at least 8-byte
@@ -95,7 +95,7 @@ LA_traits<T>::multiput(nn,fd,v,dim);
 }
 
 template <typename T>
-void NRVec<T>::get(int fd, bool dim)
+void NRVec<T>::get(int fd, bool dim, bool transp)
 {
 int nn0[2]; //align at least 8-byte
 errno=0;
diff --git a/vec.h b/vec.h
index b48f613..ef306aa 100644
--- a/vec.h
+++ b/vec.h
@@ -90,8 +90,8 @@ public:
 	void axpy(const T alpha, const T *x, const int stride=1); // this+= a*x
 	void copyonwrite();
 	void resize(const int n);
-        void get(int fd, bool dimensions=1);
-        void put(int fd, bool dimensions=1) const;
+        void get(int fd, bool dimensions=1, bool transp=0);
+        void put(int fd, bool dimensions=1, bool transp=0) const;
 	NRVec & normalize();
 	inline const double norm() const;
 	inline const T amax() const;