diff --git a/mat.cc b/mat.cc
index 0d972f6..d6bb304 100644
--- a/mat.cc
+++ b/mat.cc
@@ -27,6 +27,7 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <unistd.h>
+#include <math.h>
 
 
 namespace LA {
@@ -2754,6 +2755,7 @@ NRMat<T>& NRMat<T>::swap_rows(){
         return *this;
 }
 
+
 /***************************************************************************//**
  * interchange the order of the columns of the current (real) matrix
  * @return reference to the modified matrix
@@ -2925,6 +2927,61 @@ NRMat<T>& NRMat<T>::swap_rows(const int a, const int b){
         return *this;
 }
 
+
+/*rotate rows or columns of a matrix - general implementation, more efficient version could be done with BLAS scal and axpy operations
+ *  but it would require allocation of temporary storage
+ */
+
+template<typename T> //plane rotation of rows a and b by angle phi: row_a <- c*row_a + s*row_b, row_b <- c*row_b - s*row_a, c=cos(phi), s=sin(phi)
+NRMat<T>& NRMat<T>::rotate_rows(const int a, const int b, const T phi){
+        T tmp1,tmp2;
+        copyonwrite();
+        const T c=cos(phi);
+        const T s=sin(phi);
+#ifdef CUDALA
+        if(location == cpu){
+#endif
+                        for(int j=0;j<mm;j++){ //no 'register': the specifier was removed from the language in C++17
+                                tmp1 = (*this)(a,j); //keep both old values, each of them enters both updated elements
+                                tmp2 = (*this)(b,j);
+                                (*this)(a,j) = c*tmp1 + s*tmp2;
+                                (*this)(b,j) = c*tmp2 - s*tmp1;
+                        }
+#ifdef CUDALA
+        }else{
+                laerror("rotate_rows not implemented on gpu");
+        }
+#endif
+        return *this; //reference to the modified matrix
+}
+
+template<typename T> //plane rotation of columns a and b by angle phi: col_a <- c*col_a + s*col_b, col_b <- c*col_b - s*col_a, c=cos(phi), s=sin(phi)
+NRMat<T>& NRMat<T>::rotate_cols(const int a, const int b, const T phi){
+        T tmp1,tmp2;
+        copyonwrite();
+        const T c=cos(phi);
+        const T s=sin(phi);
+#ifdef CUDALA
+        if(location == cpu){
+#endif
+                        for(int j=0;j<nn;j++){ //no 'register': the specifier was removed from the language in C++17
+                                tmp1 = (*this)(j,a); //keep both old values, each of them enters both updated elements
+                                tmp2 = (*this)(j,b);
+                                (*this)(j,a) = c*tmp1 + s*tmp2;
+                                (*this)(j,b) = c*tmp2 - s*tmp1;
+                        }
+#ifdef CUDALA
+        }else{
+                laerror("rotate_cols not implemented on gpu"); //message names this routine (it used to say rotate_rows)
+        }
+#endif
+        return *this; //reference to the modified matrix
+}
+
+
+
+
+
 /***************************************************************************//**
  * interchange the order of the rows and columns of the current
  * real matrix \f$A\f$ of type T, i.e. perform the operation
@@ -3075,9 +3132,23 @@ NRMat<T>& NRMat<T>::swap_rows_cols(){
         return *this;
 }
 
+//construct the n x n matrix (n = p.size()) representing permutation p; direction selects placing the unit element for index i at (i,p(i)) or at (p(i),i)
+template<typename T>
+NRMat<T>::NRMat(const NRPerm<int> &p, const bool direction)
+{
+int n=p.size();
+resize(n,n); //NOTE(review): no member initializer list precedes this call - confirm resize() is safe on a not yet initialized object
+clear(); //presumably sets all elements to zero - TODO confirm
+for(int i=0; i<n; ++i)
+	{
+	if(direction) (*this)(i,p[i+1]-1)=1; //p is indexed from 1 and its values are shifted by -1 to address the 0-based matrix
+	else (*this)(p[i+1]-1,i)=1; //same element with row and column index exchanged
+	}
+}
+
 //apply permutations
 template<typename T>
-const NRMat<T> NRMat<T>::permute_rows(const NRPerm<int> &p) const
+const NRMat<T> NRMat<T>::permuted_rows(const NRPerm<int> &p, const bool inverse) const
 {
 #ifdef DEBUG
 if(!p.is_valid()) laerror("invalid permutation of matrix");
@@ -3088,12 +3159,13 @@ if(n!=nn) laerror("incompatible permutation and matrix");
         if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory");
 #endif
 NRMat<T> r(nn,mm);
-for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=0; j<mm; ++j)  r(i-1,j) = (*this)(pi,j);}
+if(inverse) for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=0; j<mm; ++j)  r(i-1,j) = (*this)(pi,j);}
+else for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=0; j<mm; ++j)  r(pi,j) = (*this)(i-1,j);}
 return r;
 }
 
 template<typename T>
-const NRMat<T> NRMat<T>::permute_cols(const NRPerm<int> &p) const
+const NRMat<T> NRMat<T>::permuted_cols(const NRPerm<int> &p, const bool inverse) const
 {
 #ifdef DEBUG
 if(!p.is_valid()) laerror("invalid permutation of matrix");
@@ -3104,12 +3176,13 @@ if(n!=mm) laerror("incompatible permutation and matrix");
         if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory");
 #endif
 NRMat<T> r(nn,mm);
-for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=0; j<nn; ++j)  r(j,i-1) = (*this)(j,pi);}
+if(inverse) for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=0; j<nn; ++j)  r(j,i-1) = (*this)(j,pi);}
+else for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=0; j<nn; ++j)  r(j,pi) = (*this)(j,i-1);}
 return r;
 }
 
 template<typename T>
-const NRMat<T> NRMat<T>::permute_both(const NRPerm<int> &p, const NRPerm<int> &q) const
+const NRMat<T> NRMat<T>::permuted_both(const NRPerm<int> &p, const NRPerm<int> &q, const bool inverse) const
 {
 #ifdef DEBUG
 if(!p.is_valid() || !q.is_valid() ) laerror("invalid permutation of matrix");
@@ -3121,11 +3194,171 @@ if(n!=nn ||m!=mm) laerror("incompatible permutation and matrix");
         if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory");
 #endif
 NRMat<T> r(nn,mm);
-for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=1; j<=m; ++j)  r(i-1,j-1) = (*this)(pi,q[j]-1);}
+if(inverse) for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=1; j<=m; ++j)  r(i-1,j-1) = (*this)(pi,q[j]-1);}
+else for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=1; j<=m; ++j)  r(pi,q[j]-1) = (*this)(i-1,j-1);}
 return r;
 }
 
 
+template<typename T> //permute the rows of this matrix in place according to a permutation given in cycle form
+void NRMat<T>::permuteme_rows(const CyclePerm<int> &p)
+{
+#ifdef DEBUG
+if(!p.is_valid()) laerror("invalid permutation of matrix");
+#endif
+if(p.max()>nn) laerror("incompatible permutation and matrix"); //cycle elements are 1-based row indices and must not exceed the row count
+#ifdef CUDALA
+        if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory");
+#endif
+copyonwrite();
+T *tmp = new T[mm]; //buffer for one row, released at the end of the function
+for(int cycle=1; cycle<=p.size(); ++cycle)
+        {
+        int length= p[cycle].size();
+        if(length<=1) continue; //trivial cycle
+        for(int j=0; j<mm; ++j) tmp[j] = (*this)(p[cycle][length]-1,j); //save the row at the last position of the cycle
+        for(int i=length; i>1; --i) //shift rows along the cycle, going backwards so no row is overwritten before it has been moved
+		for(int j=0; j<mm; ++j) (*this)(p[cycle][i]-1,j)=(*this)(p[cycle][i-1]-1,j);
+	for(int j=0; j<mm; ++j) (*this)(p[cycle][1]-1,j)=tmp[j]; //the saved row wraps around to the first position of the cycle
+        }
+delete[] tmp;
+}
+
+template<typename T> //permute the columns of this matrix in place according to a permutation given in cycle form
+void NRMat<T>::permuteme_cols(const CyclePerm<int> &p)
+{
+#ifdef DEBUG
+if(!p.is_valid()) laerror("invalid permutation of matrix");
+#endif
+if(p.max()>mm) laerror("incompatible permutation and matrix"); //cycle elements are 1-based column indices and must not exceed the column count
+#ifdef CUDALA
+        if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory");
+#endif
+copyonwrite();
+T *tmp = new T[nn]; //buffer for one column, released at the end of the function
+for(int cycle=1; cycle<=p.size(); ++cycle)
+        {
+        int length= p[cycle].size();
+        if(length<=1) continue; //trivial cycle
+        for(int j=0; j<nn; ++j) tmp[j] = (*this)(j,p[cycle][length]-1); //save the column at the last position of the cycle
+        for(int i=length; i>1; --i) //shift columns along the cycle, going backwards so no column is overwritten before it has been moved
+                for(int j=0; j<nn; ++j) (*this)(j,p[cycle][i]-1)=(*this)(j,p[cycle][i-1]-1);
+        for(int j=0; j<nn; ++j) (*this)(j,p[cycle][1]-1)=tmp[j]; //the saved column wraps around to the first position of the cycle
+        }
+delete[] tmp;
+}
+
+
+//BLAS-based specializations for double and std::complex<double>; this one multiplies row i of a double matrix by f, in place
+template<>
+void NRMat<double>::scale_row(const int i, const double f)
+{
+#ifdef DEBUG
+if(i<0||i>=nn) laerror("index out of range in scale_row");
+#endif
+copyonwrite();
+#ifdef CUDALA
+        if(location == cpu) {
+#endif
+                cblas_dscal(mm, f, &(*this)(i,0), 1); //mm elements starting at element (i,0), unit stride
+#ifdef CUDALA
+        }else{
+                cublasDscal(mm, f, v+i*mm, 1); //the same row addressed directly in the storage v at offset i*mm
+                TEST_CUBLAS("cublasDscal");
+        }
+#endif
+}
+
+template<> //multiply column i of a double matrix by f, in place
+void NRMat<double>::scale_col(const int i, const double f)
+{
+#ifdef DEBUG
+if(i<0||i>=mm) laerror("index out of range in scale_col");
+#endif
+copyonwrite();
+#ifdef CUDALA
+        if(location == cpu) {
+#endif
+                cblas_dscal(nn, f, &(*this)(0,i), mm); //nn elements starting at element (0,i), consecutive ones are mm apart
+#ifdef CUDALA
+        }else{
+                cublasDscal(nn, f, v+i, mm); //the same column addressed directly in the storage v at offset i
+                TEST_CUBLAS("cublasDscal");
+        }
+#endif
+}
+
+
+template<> //multiply row i of a complex matrix by the complex factor f, in place
+void NRMat<std::complex<double> >::scale_row(const int i, const std::complex<double> f)
+{
+#ifdef DEBUG
+if(i<0||i>=nn) laerror("index out of range in scale_row");
+#endif
+copyonwrite();
+#ifdef CUDALA
+        if(location == cpu) {
+#endif
+                cblas_zscal(mm, &f, &(*this)(i,0), 1); //mm elements starting at element (i,0), unit stride
+#ifdef CUDALA
+        }else{
+                const cuDoubleComplex fac = *(reinterpret_cast<const cuDoubleComplex*> (&f));
+                cublasZscal(mm, fac, reinterpret_cast<cuDoubleComplex*>(v+i*mm), 1); //legacy cuBLAS call (as cublasDscal in the double version): factor by value, data as cuDoubleComplex*
+                TEST_CUBLAS("cublasZscal"); //label matches the routine actually called
+        }
+#endif
+}
+
+template<> //multiply column i of a complex matrix by the complex factor f, in place
+void NRMat<std::complex<double> >::scale_col(const int i, const std::complex<double> f)
+{
+#ifdef DEBUG
+if(i<0||i>=mm) laerror("index out of range in scale_col");
+#endif
+copyonwrite();
+#ifdef CUDALA
+        if(location == cpu) {
+#endif
+                cblas_zscal(nn, &f, &(*this)(0,i), mm); //nn elements starting at element (0,i), consecutive ones are mm apart
+#ifdef CUDALA
+        }else{
+                const cuDoubleComplex fac = *(reinterpret_cast<const cuDoubleComplex*> (&f));
+                cublasZscal(nn, fac, reinterpret_cast<cuDoubleComplex*>(v+i), mm); //legacy cuBLAS call (as cublasDscal in the double version): factor by value, data as cuDoubleComplex*
+                TEST_CUBLAS("cublasZscal"); //label matches the routine actually called
+        }
+#endif
+}
+
+
+
+
+
+
+//generic version for element types other than the double and std::complex<double> specializations: multiply row i by f, in place
+template<typename T>
+void NRMat<T>::scale_row(const int i, const T f)
+{
+#ifdef DEBUG
+if(!(i>=0 && i<nn)) laerror("index out of range in scale_row");
+#endif
+copyonwrite();
+for(int col=0; col!=mm; ++col) (*this)(i,col) = (*this)(i,col) * f;
+}
+
+
+template<typename T> //generic version: multiply column i by f, in place
+void NRMat<T>::scale_col(const int i, const T f)
+{
+#ifdef DEBUG
+if(!(i>=0 && i<mm)) laerror("index out of range in scale_col");
+#endif
+copyonwrite();
+for(int row=0; row!=nn; ++row) (*this)(row,i) = (*this)(row,i) * f;
+}
+
+
+
+
 
 
 /***************************************************************************//**
diff --git a/mat.h b/mat.h
index 0b1eb41..089b030 100644
--- a/mat.h
+++ b/mat.h
@@ -117,9 +117,14 @@ public:
 	void copyonwrite(bool detachonly=false);
 
 	//! permute matrix elements
-        const NRMat permute_rows(const NRPerm<int> &p) const;
-        const NRMat permute_cols(const NRPerm<int> &p) const;
-        const NRMat permute_both(const NRPerm<int> &p, const NRPerm<int> &q) const;
+        const NRMat permuted_rows(const NRPerm<int> &p, const bool inverse=false) const;
+        const NRMat permuted_cols(const NRPerm<int> &p, const bool inverse=false) const;
+        const NRMat permuted_both(const NRPerm<int> &p, const NRPerm<int> &q, const bool inverse=false) const;
+	void permuteme_rows(const CyclePerm<int> &p); //in place
+	void permuteme_cols(const CyclePerm<int> &p); //in place
+	void scale_row(const int i, const T f); //in place
+	void scale_col(const int i, const T f); //in place
+	explicit NRMat(const NRPerm<int> &p, const bool direction); //permutation matrix
 
 
 	/***************************************************************************//**
@@ -349,6 +354,11 @@ public:
 	// LV - swapping of rows i and j
 	NRMat & swap_rows(const int i, const int j);
 
+	//rotate rows or columns through an angle
+	NRMat & rotate_cols(const int i, const int j, const T phi);
+        NRMat & rotate_rows(const int i, const int j, const T phi);
+
+
 	//! multiply by sparse matrix
 	SparseSMat<T> operator*(const SparseSMat<T> &rhs) const;
 
diff --git a/permutation.cc b/permutation.cc
index 642673b..ea69e25 100644
--- a/permutation.cc
+++ b/permutation.cc
@@ -17,6 +17,10 @@
 */
 
 #include "permutation.h"
+#include <stdio.h>
+#include <string.h>
+
+
 namespace LA {
 
 template <typename T>
@@ -27,6 +31,7 @@ T n=this->size();
         if(n<0) laerror("invalid permutation size");
 #endif
 if(n==0) return;
+this->copyonwrite();
 for(T i=1; i<=n; ++i) (*this)[i]=i;
 }
 
@@ -118,12 +123,14 @@ return (count&1)? -1:1;
 }
 
 template <typename T>
-NRPerm<T>::NRPerm(const CyclePerm<T> &rhs, int n)
-: NRVec_from1<T>(n)
+NRPerm<T>::NRPerm(const CyclePerm<T> &rhs, const int n)
 {
 #ifdef DEBUG
         if(!rhs.is_valid()) laerror("invalid cycle permutation");
 #endif
+int m;
+if(n) m=n; else m=rhs.max();
+this->resize(m);
 
 identity();
 T ncycles=rhs.size();
@@ -137,6 +144,22 @@ if(!is_valid()) laerror("internal error in NRPerm constructor from CyclePerm");
 #endif
 }
 
+template <typename T> //replace this permutation by a random one of the same size (Fisher-Yates shuffle applied to the identity)
+void NRPerm<T>::randomize(void)
+{
+int n=this->size();
+if(n<=0) laerror("cannot randomize empty permutation");
+this->copyonwrite();
+this->identity();
+for(int i=n-1; i>=1; --i) //i and j are 0-based positions, the elements themselves are accessed 1-based as [i+1] and [j+1]
+	{
+	int j= random()%(i+1); //0<=j<=i; NOTE(review): random() is seeded by srandom() per POSIX, and the modulo is only approximately uniform
+	T tmp = (*this)[i+1];
+	(*this)[i+1]=(*this)[j+1];
+	(*this)[j+1]=tmp;
+	}
+}
+
 
 ////////////////////////////////////////////////////////
 
@@ -150,7 +173,7 @@ T n=p.size();
 NRVec_from1<T> used(0,n),tmp(n);
 T firstunused=1;
 T currentcycle=0;
-std::list<NRVec<T> > cyclelist={};
+std::list<NRVec_from1<T> > cyclelist={};
 do
         {
         //find a cycle starting with first unused element
@@ -196,7 +219,7 @@ for(T i=1; i<=this->size(); ++i)
 		for(T ii=i; ii<=this->size(); ++ii)
 			{
 			T nn=(*this)[ii].size();
-			for(T jj=1; jj<=nn; ++jj)
+			for(T jj=(ii==i?j+1:1); jj<=nn; ++jj)
 				{
 				T xx=(*this)[ii][jj];
 				if(x==xx) return false;
@@ -233,11 +256,24 @@ for(T i=1; i<=ncycles; ++i)
 	{
 	T length=(*this)[i].size();
 	r[i].resize(length);
+	//reverse order in cycles (does not matter in cycle lengths 1 and 2 anyway)
 	for(T j=1; j<=length; ++j) r[i][j] = (*this)[i][length-j+1];
 	}
 return r;
 }
 
+//product of two permutations in cycle form, computed via NRPerm of a common size - could there be a more efficient direct algorithm?
+template <typename T>
+CyclePerm<T> CyclePerm<T>::operator*(const CyclePerm q) const
+{
+int m=this->max();
+const int mq=q.max();
+if(mq>m) m=mq; //common size is the larger of the two maxima, otherwise q would not fit into an NRPerm of size m
+NRPerm<T> qq(q,m);
+NRPerm<T> pp(*this,m);
+NRPerm<T> rr=pp*qq; //q is the rhs and applied first, *this is applied second
+return CyclePerm<T>(rr);
+}
 
 template <typename T>
 int CyclePerm<T>::parity() const
@@ -271,12 +307,105 @@ for(T i=1; i<=ncycles; ++i)
 	r[length]++;
 	}
 //fill in trivial cycles of length one
-r[1] = n - r.sum();
+r[1] += n - r.sum();
 if(r[1]<0) laerror("inconsistent cycle lengths in CyclePerm::cycles");
 return r;
 }
 
 
+//auxiliary function for input of a permutation in cycle format: reads the first "( ... )" group found in p into c
+//returns pointer just after its closing bracket, or NULL if p is empty or no complete bracketed group is found
+//c can come back empty or shortened, since reading of numbers stops silently at the first token that is not an integer
+template <typename T>
+const char *read1cycle(NRVec_from1<T> &c, const char *p)
+{
+if(*p==0) return NULL; //end of the input string
+const char *openbracket = strchr(p,'(');
+if(!openbracket) return NULL;
+const char *closebracket = strchr(openbracket+1,')');
+if(!closebracket) return NULL; //opening bracket without a closing one
+const char *s = openbracket+1;
+int r;
+int length=0;
+std::list<T> cycle;
+do      {
+	long int tmp;
+        int nchar;
+        if(*s==',') ++s; //a comma directly after the previous number is accepted as a separator
+        r = sscanf(s,"%ld%n",&tmp,&nchar); //%n stores the number of characters consumed so far
+        if(r==1)
+                {
+                ++length;
+                s += nchar;
+		cycle.push_back((T)tmp);
+                }
+        }
+        while(r==1 && s<closebracket);
+
+//make vector from list
+c.resize(length);
+int i=0;
+for(auto l=cycle.begin(); l!=cycle.end(); ++l) c[++i] = *l;
+
+return closebracket+1;
+}
+
+template <typename T> //parse a permutation written in cycle notation, e.g. "(1 2 3)(4 5)", and store its non-empty cycles in *this
+void CyclePerm<T>::readfrom(const std::string &line)
+{
+const char *p=line.c_str();
+std::list<NRVec_from1<T> > cyclelist={}; //same element type as *this, so cycles are stored and copied back without any conversion
+//the cycles are collected in a list first because their number is not known in advance
+int count=0;
+NRVec_from1<T> c;
+while((p=read1cycle(c,p))) //read1cycle returns NULL when no further bracketed group can be read
+        {
+        //printf("cycle %d of length %d read\n",count,c.size());
+        if(c.size()!=0) //store a nonempty cycle
+                {
+                ++count;
+                cyclelist.push_back(c);
+                }
+        }
+
+
+//convert list to vector
+this->resize(count);
+T i=0;
+for(auto l=cyclelist.begin(); l!=cyclelist.end(); ++l) (*this)[++i] = *l;
+#ifdef DEBUG
+if(!this->is_valid()) laerror("readfrom received input of invalid CyclePerm");
+#endif
+}
+
+
+template <typename T> //read one whole text line from the stream and parse it as a permutation in cycle notation
+std::istream & operator>>(std::istream &s, CyclePerm<T> &x)
+{
+std::string textline;
+std::getline(s,textline);
+x.readfrom(textline);
+return s;
+}
+
+template <typename T> //print every cycle as "(e1 e2 ... ek)"; cycles follow each other with no separator and no newline is written
+std::ostream & operator<<(std::ostream &s, const CyclePerm<T> &x)
+{
+for(int c=1; c<=x.size(); ++c)
+	{
+	s<<"(";
+	for(int k=1; k<=x[c].size(); ++k)
+		{
+		if(k>1) s<<" "; //single blank between elements, none before the first or after the last
+		s<<x[c][k];
+		}
+	s<<")";
+	}
+return s;
+}
+
+
+
 ///////////////////////////////////////////////////////
 
 
@@ -286,4 +415,16 @@ return r;
 template class NRPerm<int>;
 template class CyclePerm<int>;
 template class Partition<int>;
-}
+
+#define INSTANTIZE(T) \
+template std::istream & operator>>(std::istream &s, CyclePerm<T> &x); \
+template std::ostream & operator<<(std::ostream &s, const CyclePerm<T> &x); \
+
+//explicit instantiation of the CyclePerm stream operators, whose template definitions are in this file
+
+INSTANTIZE(int)
+
+
+
+
+}//namespace
diff --git a/permutation.h b/permutation.h
index 8c651f4..db3258e 100644
--- a/permutation.h
+++ b/permutation.h
@@ -41,7 +41,7 @@ public:
 	NRPerm(const NRVec_from1<T> &rhs): NRVec_from1<T>(rhs) {};
 	NRPerm(const T &a, const int n): NRVec_from1<T>(a, n) {};
         NRPerm(const T *a, const int n): NRVec_from1<T>(a, n) {};
-	NRPerm(const CyclePerm<T> &rhs, int n); 
+	explicit NRPerm(const CyclePerm<T> &rhs, const int n=0); 
 
 	//specific operations
 	void identity();
@@ -51,10 +51,10 @@ public:
 	NRPerm operator*(const NRPerm q) const; //q is rhs and applied first, this applied second
 	NRPerm conjugate_by(const NRPerm q) const; //q^-1 p q
 	int parity() const;
+	void randomize(void); //uniformly random by Fisher-Yates shuffle
 
 
 	//TODO:
-	//@@@permutation matrix
 	//@@@permgener
 	//@@@next permutation
 	//@@@lex rank
@@ -67,19 +67,27 @@ template <typename T>
 class CyclePerm : public NRVec_from1<NRVec_from1<T> > {
 public:
 	CyclePerm() :  NRVec_from1<NRVec_from1<T> >() {};
-	CyclePerm(const NRPerm<T> &rhs);
+	explicit CyclePerm(const NRPerm<T> &rhs); //explicit: no silent conversion from NRPerm
 
 	bool is_valid() const; //is it really a permutation
 	bool is_identity() const; //no cycles of length > 1
 	CyclePerm inverse() const; //reverse all cycles
 	int parity() const; //negative if having odd number of even-length cycles
+	T max() const {T m=0; for(int i=1; i<=this->size(); ++i) {T mm= (*this)[i].max(); if(mm>m) m=mm;} return m;} //largest element found in any cycle, 0 if there is none
 	Partition<T> cycles(const T n) const;
-	//@@@efficient algorithm for multiplication?
-	//@@@operator >> and <<
-	//@@@operation in place on matrix and vector
+	void readfrom(const std::string &line); //parse cycle notation such as "(1 2 3)(4 5)" from one line of text
+	CyclePerm operator*(const CyclePerm q) const; //q is rhs and applied first, this applied second
 };
 
 
+template <typename T>
+std::istream & operator>>(std::istream &s, CyclePerm<T> &x);
+
+template <typename T>
+std::ostream & operator<<(std::ostream &s, const CyclePerm<T> &x);
+
+
+
 //partitions stored as #of 1s, #of 2s, etc.
 template <typename T>
 class Partition : public NRVec_from1<T> {
@@ -94,8 +102,8 @@ public:
 //@@@generate all partitions, 
 //@@@enumerator of partitions of n to r parts and total
 //@@@adjoint partition, 
-//@@@ output as in the group character table
-//@@@Sn character table
+//@@@output formatted as in the group character table
+//@@@Sn character table computation
 };
 
 
diff --git a/smat.cc b/smat.cc
index 75101bb..8d7c3b1 100644
--- a/smat.cc
+++ b/smat.cc
@@ -305,7 +305,7 @@ void NRSMat<T>::fscanf(FILE *f, const char *format) {
 
 //apply permutation
 template <typename T>
-const NRSMat<T> NRSMat<T>::permute(const NRPerm<int> &p) const
+const NRSMat<T> NRSMat<T>::permuted(const NRPerm<int> &p, const bool inverse) const
 {
 #ifdef DEBUG
 if(!p.is_valid()) laerror("invalid permutation of smatrix");
@@ -316,7 +316,8 @@ if(n!=(*this).size()) laerror("incompatible permutation and smatrix");
         if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory");
 #endif
 NRSMat<T> r(n);
-for(int i=1; i<=n; ++i) {int pi = p[i]-1; r(i-1,i-1) = (*this)(pi,pi);}
+if(inverse) for(int i=1; i<=n; ++i) {int pi = p[i]-1; r(i-1,i-1) = (*this)(pi,pi);}
+else for(int i=1; i<=n; ++i) {int pi = p[i]-1; r(pi,pi) = (*this)(i-1,i-1);}
 return r;
 }
 
diff --git a/smat.h b/smat.h
index d6b4a09..3dd2c10 100644
--- a/smat.h
+++ b/smat.h
@@ -93,7 +93,7 @@ public:
 	NRSMat & operator=(const T &a);
 
 	//! permute matrix elements
-        const NRSMat permute(const NRPerm<int> &p) const;
+        const NRSMat permuted(const NRPerm<int> &p, const bool inverse=false) const;
 
 	inline int getcount() const {return count?*count:0;}
 
diff --git a/t.cc b/t.cc
index 25d5d59..4ba4c42 100644
--- a/t.cc
+++ b/t.cc
@@ -1989,7 +1989,7 @@ c=a+b;
 cout<<c;
 }
 
-if(1)
+if(0)
 {
 NRPerm<int> p;
 cin >>p;
@@ -1997,8 +1997,67 @@ int n=p.size();
 NRVec_from1<double> v(n);
 int i;
 for(i=1; i<=n; ++i) v[i]=10.*i;
-cout <<v.permute(p);
+cout <<v.permuted(p);
 }
 
+if(0)
+{
+CyclePerm<int> c;
+cin>>c;
+cout<<c<<endl;
+NRPerm<int> p(c);
+cout <<p;
+CyclePerm<int> cc(p);
+cout <<cc<<endl;
+}
+
+if(0)
+{
+int seed;
+int f=open("/dev/random",O_RDONLY);
+if(sizeof(int)!=read(f,&seed,sizeof(int))) laerror("cannot read /dev/random");
+close(f);
+srand(seed);
+int n;
+cin >>n;
+NRPerm<int> p(n);
+p.randomize();
+cout <<p;
+CyclePerm<int> cc(p);
+cout <<cc<<endl;
+NRPerm<int> pp(cc,n);
+cout <<pp;
+if(pp!=p) laerror("inconsistency");
+NRVec<double> v(n);
+for(int i=0; i<n; ++i) v[i]=10.*(i+1);
+NRVec<double> vv(v);
+v.permuteme(cc);
+cout <<v;
+NRVec<double>  vvv= vv.permuted(pp);
+cout<<vvv;
+cout<<"error "<<(v-vvv).norm()<<endl;
+}
+
+if(1)
+{
+int seed;
+int f=open("/dev/random",O_RDONLY);
+if(sizeof(int)!=read(f,&seed,sizeof(int))) laerror("cannot read /dev/random");
+close(f);
+srand(seed);
+int n;
+cin >>n;
+NRVec<double> v(n);
+v.randomize(1.);
+NRVec<double> vv(v);
+NRPerm<int> p(n);
+vv.sort(0,p);
+NRVec<double> vvv=v.permuted(p,true);
+NRVec<double> v4=vv.permuted(p,false);
+cout<<v<<vv;
+cout<<vvv<<v4<<p;
+cout <<"error "<<(vv-vvv).norm() <<" "<<(v-v4).norm()<<endl;
+
+}
 
 }
diff --git a/vec.cc b/vec.cc
index 1a758c0..17cce0e 100644
--- a/vec.cc
+++ b/vec.cc
@@ -815,6 +815,15 @@ int NRVec<T>::sort(int direction, int from, int to, int *perm) {
 	else return memqsort<0, NRVec<T>, int, int>(*this, perm, from, to);
 }
 
+template<typename T> //sort the whole vector and pass perm, reset to the identity, to the pointer-based sort as its permutation array
+int NRVec<T>::sort(int direction, NRPerm<int> &perm)
+{
+if(perm.size()!=nn) laerror("incompatible vector and permutation");
+perm.identity();
+int *const permdata = &perm[1]; //address of the first element of the 1-based permutation
+return sort(direction,0,nn-1,permdata);
+}
+
 template<>
 NRVec<std::complex<double> > complexify(const NRVec<double> &rhs) {
 	NRVec<std::complex<double> > r(rhs.size(), rhs.getlocation());
@@ -834,7 +843,7 @@ NRVec<std::complex<double> > complexify(const NRVec<double> &rhs) {
 }
 
 template<typename T>
-const NRVec<T> NRVec<T>::permute(const NRPerm<int> &p) const
+const NRVec<T> NRVec<T>::permuted(const NRPerm<int> &p, const bool inverse) const
 {
 #ifdef DEBUG
 if(!p.is_valid()) laerror("invalid permutation of vector");
@@ -845,10 +854,34 @@ if(n!=(*this).size()) laerror("incompatible permutation and vector");
         if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory");
 #endif
 NRVec<T> r(n);
-for(int i=1; i<=n; ++i) r[i-1] = v[p[i]-1];
+if(inverse) for(int i=1; i<=n; ++i) r[i-1] = v[p[i]-1];
+else for(int i=1; i<=n; ++i) r[p[i]-1] = v[i-1];
 return r;
 }
 
+
+template<typename T> //permute the elements of this vector in place according to a permutation given in cycle form
+void NRVec<T>::permuteme(const CyclePerm<int> &p)
+{
+#ifdef DEBUG
+if(!p.is_valid()) laerror("invalid permutation of vector");
+#endif
+if(p.max()>nn) laerror("incompatible permutation and vector"); //cycle elements are 1-based indices and must not exceed the vector length
+#ifdef CUDALA
+        if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory");
+#endif
+copyonwrite();
+for(int cycle=1; cycle<=p.size(); ++cycle)
+	{
+	int length= p[cycle].size();
+	if(length<=1) continue; //trivial cycle
+	T tmp = v[p[cycle][length]-1]; //save the element at the last position of the cycle
+	for(int i=length; i>1; --i) v[p[cycle][i]-1] = v[p[cycle][i-1]-1]; //shift along the cycle, going backwards so nothing is overwritten before it has been moved
+	v[p[cycle][1]-1] = tmp; //the saved element wraps around to the first position of the cycle
+	}
+}
+
+
 /***************************************************************************//**
  * forced instantization in the corespoding object file
  ******************************************************************************/
@@ -911,6 +944,38 @@ INSTANTIZE_DUMMY(std::complex<unsigned long long>)
 INSTANTIZE_DUMMY(std::complex<std::complex<double> >)
 INSTANTIZE_DUMMY(std::complex<std::complex<float> >)
 
+
+
+//max() and min() specializations for element types that can be ordered; an empty vector yields 0; NOTE(review): described as not supported on gpu, but no location check is made here
+#define INSTANTIZE_NONCOMPLEX(T) \
+template<>\
+const T NRVec<T>::max() const\
+{\
+if(nn==0) return 0;\
+T m=v[0];\
+for(int i=1; i<nn; ++i) if(v[i]>m) m=v[i];\
+return m;\
+}\
+\
+template<>\
+const T NRVec<T>::min() const\
+{\
+if(nn==0) return 0;\
+T m=v[0];\
+for(int i=1; i<nn; ++i) if(v[i]<m) m=v[i];\
+return m;\
+}\
+
+
+//only the types listed below get max()/min(); unsigned and complex element types are not covered here
+INSTANTIZE_NONCOMPLEX(char)
+INSTANTIZE_NONCOMPLEX(short)
+INSTANTIZE_NONCOMPLEX(int)
+INSTANTIZE_NONCOMPLEX(long)
+INSTANTIZE_NONCOMPLEX(long long)
+INSTANTIZE_NONCOMPLEX(float)
+INSTANTIZE_NONCOMPLEX(double)
+
 template class NRVec<double>;
 template class NRVec<std::complex<double> >;
 template class NRVec<char>;
diff --git a/vec.h b/vec.h
index 5c40609..39d9473 100644
--- a/vec.h
+++ b/vec.h
@@ -271,7 +271,8 @@ public:
         };
 
 	//! permute vector elements
-	const NRVec permute(const NRPerm<int> &p) const;
+	const NRVec permuted(const NRPerm<int> &p, const bool inverse=false) const;
+	void permuteme(const CyclePerm<int> &p); //in place
 
 	//! compute the sum of the absolute values of the elements of this vector 
 	inline const typename LA_traits<T>::normtype asum() const;
@@ -318,6 +319,12 @@ public:
 	//! determine the minimal element (in the absolute value) of this vector 
 	inline const T amin() const;
 
+        //! determine the maximal element of this vector
+        const T max() const;
+        //! determine the minimal element of this vector
+        const T min() const;
+
+
 	//! routine for formatted output 
 	void fprintf(FILE *f, const char *format, const int modulo) const;
 	//! routine for unformatted output 
@@ -355,6 +362,7 @@ public:
 
 	//! sort by default in ascending order and return the parity of corresponding permutation resulting to this order
 	int sort(int direction = 0, int from = 0, int to = -1, int *perm = NULL);
+	int sort(int direction, NRPerm<int> &perm);
 
 	//! apply given function to each element
 	NRVec& call_on_me(T (*_F)(const T &) ){
@@ -1082,6 +1090,8 @@ void NRVec<T>::moveto(const GPUID dest) {
 }
 #endif
 
+
+
 /***************************************************************************//**
  * adds a real scalar value \f$\alpha\f$ to all elements of this real vector \f$\vec{x}\f$
  * \f[\vec{x}_i\leftarrow\vec{x}_i+\alpha\f]