/* vim: set ts=8 sw=8 sts=8 noexpandtab cindent: */ /******************************************************************************* LA: linear algebra C++ interface library Copyright (C) 2008 Jiri Pittner or complex versions written by Roman Curik This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . *******************************************************************************/ #include "mat.h" #include #include #include #include #include #include #include #include #include "nonclass.h" namespace LA { /***************************************************************************//** * implements direct sum with a given matrix \f$B\f$ via storesubmatrix() * @param[in] rhs input matrix \f$B\f$ * @return result of the computation (new instance of NRMat) * @see submatrix() ******************************************************************************/ template const NRMat NRMat::oplus(const NRMat &rhs) const { // special cases if(nn == 0 && mm == 0) return rhs; if(rhs.nn == 0 && rhs.mm == 0) return *this; SAME_LOC(*this, rhs); NRMat ret(nn + rhs.nn, mm + rhs.mm, getlocation()); ret.clear(); ret.storesubmatrix(0, 0, *this); ret.storesubmatrix(nn, mm, rhs); return ret; } /***************************************************************************//** * implements direct product with a given matrix \f$B\f$ * @param[in] rhs input matrix \f$B\f$ * @return result of the computation (new instance of NRMat) ******************************************************************************/ template const NRMat NRMat::otimes(const NRMat &rhs, bool reversecolumns) const { // special cases if(nn == 0 && mm == 0) return *this; if(rhs.nn == 0 && rhs.mm == 0) return rhs; NRMat r((T)0, nn*rhs.nn, mm*rhs.mm); int i,j,k,l; if(reversecolumns){ for(i=0;iT * @param[in] i row index starting from zero * @param[in] l consider this value as the count of columns * @return extracted elements as a NRVec object ******************************************************************************/ template const NRVec NRMat::row(const int i, int l) const { #ifdef DEBUG if(i < 0 || i >= nn) laerror("illegal index"); #endif if(l < 0) l = mm; NRVec r(l); LA_traits::copy(&r[0], #ifdef MATPTR v[i] #else v + i*(size_t)l #endif , l); return r; } /***************************************************************************//** * routine for raw output * @param[in] fd file descriptor for output * @param[in] dim number of elements intended for output * @param[in] transp reserved * @see NRVec::put() ******************************************************************************/ template void NRMat::put(int fd, bool dim, bool transp) const { #ifdef CUDALA if(location != cpu) { NRMat tmp = *this; tmp.moveto(cpu); tmp.put(fd, dim, transp); return; } #endif errno = 0; if(dim){ if(sizeof(int) != write(fd,&(transp?mm:nn),sizeof(int))) laerror("write failed"); if(sizeof(int) != write(fd,&(transp?nn:mm),sizeof(int))) laerror("write failed"); } if(transp){ //not particularly efficient for(int j=0; j::put(fd, #ifdef MATPTR v[i][j] #else v[i*(size_t)mm+j] #endif ,dim ,transp); } } }else{ LA_traits::multiput((size_t)nn*(size_t)mm,fd, #ifdef MATPTR v[0] #else v #endif ,dim); } } /***************************************************************************//** * routine for raw input * @param[in] fd file descriptor for input * @param[in] dim number of elements intended for input, for dim=0 perform copyonwrite * @param[in] transp reserved * @see NRVec::get(), copyonwrite() ******************************************************************************/ template void NRMat::get(int fd, bool dim, bool transp){ #ifdef CUDALA if(location != cpu){ NRMat tmp; tmp.moveto(cpu); tmp.get(fd, dim, transp); tmp.moveto(getlocation()); *this = tmp; return; } #endif int nn0, mm0; errno = 0; if(dim){ if(sizeof(int) != read(fd, &nn0, sizeof(int))) laerror("read failed"); if(sizeof(int) != read(fd, &mm0, sizeof(int))) laerror("read failed"); if(transp) resize(mm0, nn0); else resize(nn0, mm0); }else{ copyonwrite(); } if(transp){ for(register int j=0; j::get(fd, #ifdef MATPTR v[i][j] #else v[i*(size_t)mm+j] #endif ,dim,transp); } } }else{ LA_traits::multiget((size_t)nn*(size_t)mm,fd, #ifdef MATPTR v[0] #else v #endif ,dim); } } /***************************************************************************//** * assigns a scalar value of general type T to the diagonal elements of this * matrix of general type T * @param[in] a scalar value of type T * @return reference to the modified matrix ******************************************************************************/ template NRMat& NRMat::operator=(const T &a) { NOT_GPU(*this); const int n2 = nn*nn; copyonwrite(); #ifdef DEBUG if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef MATPTR memset(v[0], 0, n2*sizeof(T)); for(register int i=0; i < nn; i++) v[i][i] = a; #else memset(v, 0, n2*sizeof(T)); for(register int i=0; i < n2; i += nn + 1) v[i] = a; #endif return *this; } /***************************************************************************//** * assigns a double-precision real scalar value to the diagonal elements of this * double-precision real matrix * @param[in] a double-precision real scalar value * @return reference to the modified matrix ******************************************************************************/ template <> NRMat& NRMat::operator=(const double &a){ const int n2 = nn*nn; copyonwrite(); #ifdef DEBUG if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR memset(v[0], 0, n2*sizeof(double)); for(register int i=0; i< nn; i++) v[i][i] = a; #else const double n = 0.; //set all matrix elements equal to zero cblas_dcopy(n2, &n, 0, v, 1); //update the diagonal elements cblas_dcopy(nn, &a, 0, v, nn + 1); #endif #ifdef CUDALA }else{ smart_gpu_set(n2, 0.0, v, 1); smart_gpu_set(nn, a, v, nn + 1); } #endif return *this; } /***************************************************************************//** * assigns a double-precision complex scalar value to the diagonal elements of this * double-precision complex matrix * @param[in] a double-precision complex scalar value * @return reference to the modified matrix ******************************************************************************/ template <> NRMat >& NRMat >::operator=(const std::complex &a){ const int n2 = nn*nn; copyonwrite(); #ifdef DEBUG if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR memset(v[0], 0, n2*sizeof(std::complex)); for(register int i=0; i< nn; i++) v[i][i] = a; #else //set all matrix elements equal to zero cblas_zcopy(n2, &CZERO, 0, v, 1); //update the diagonal elements cblas_zcopy(nn, &a, 0, v, nn + 1); #endif #ifdef CUDALA }else{ smart_gpu_set(n2, CZERO, v, 1); smart_gpu_set(nn, a, v, nn + 1); } #endif return *this; } /***************************************************************************//** * adds a double-precision real scalar value to the diagonal elements of this * double-precision real matrix * @param[in] a double-precision real scalar value * @return reference to the modified matrix ******************************************************************************/ template <> NRMat & NRMat::operator+=(const double& a) { copyonwrite(); #ifdef DEBUG if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR for(register int i=0; i < nn; i++) v[i][i] += a; #else cblas_daxpy(nn, 1.0, &a, 0, *this, nn + 1); #endif #ifdef CUDALA }else{ double *d = gpuputdouble(a); cublasDaxpy(nn, 1.0, d, 0, *this, nn+1); TEST_CUBLAS("cublasDaxpy"); gpufree(d); } #endif return *this; } /***************************************************************************//** * adds a double-precision complex scalar value to the diagonal elements of this * double-precision complex matrix * @param[in] a double-precision complex scalar value * @return reference to the modified matrix ******************************************************************************/ template <> NRMat > & NRMat >::operator+=(const std::complex& a) { copyonwrite(); #ifdef DEBUG if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR for(register int i=0; i < nn; i++) v[i][i] += a; #else cblas_zaxpy(nn, &CONE, &a, 0, *this, nn + 1); #endif #ifdef CUDALA }else{ std::complex* d = gpuputcomplex(a); cublasZaxpy(nn, CUMONE, (cuDoubleComplex*)d, 0, (cuDoubleComplex*)v, nn+1); TEST_CUBLAS("cublasDaxpy"); gpufree(d); } #endif return *this; } /***************************************************************************//** * subtracts a double-precision real scalar value from the diagonal elements of this * double-precision real matrix * @param[in] a double-precision real scalar value * @return reference to the modified matrix ******************************************************************************/ template <> NRMat& NRMat::operator-=(const double& a) { copyonwrite(); #ifdef DEBUG if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR for(register int i=0; i< nn; i++) v[i][i] -= a; #else cblas_daxpy(nn, -1.0, &a, 0, *this, nn+1); #endif #ifdef CUDALA }else{ double *d = gpuputdouble(a); cublasDaxpy(nn, -1.0, d, 0, *this, nn+1); TEST_CUBLAS("cublasDaxpy"); gpufree(d); } #endif return *this; } /***************************************************************************//** * subtracts a double-precision complex scalar value from the diagonal elements of this * double-precision complex matrix * @param[in] a double-precision complex scalar value * @return reference to the modified matrix ******************************************************************************/ template <> NRMat >& NRMat >::operator-=(const std::complex& a) { copyonwrite(); #ifdef DEBUG if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR for(register int i=0; i < nn; i++) v[i][i] -= a; #else cblas_zaxpy(nn, &CMONE, &a, 0, *this, nn + 1); #endif #ifdef CUDALA }else{ std::complex* d = gpuputcomplex(a); cublasZaxpy(nn, CUMONE, (cuDoubleComplex*)d, 0, (cuDoubleComplex*)v, nn+1); TEST_CUBLAS("cublasDaxpy"); gpufree(d); } #endif return *this; } /***************************************************************************//** * add a scalar value of type T to the diagonal elements of this * matrix of general type T * @return reference to the modified matrix ******************************************************************************/ template NRMat& NRMat::operator+=(const T &a) { NOT_GPU(*this); copyonwrite(); #ifdef DEBUG if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef MATPTR for(register int i=0; i < nn; i++) v[i][i] += a; #else for(register int i=0; i < nn*nn; i += nn+1) v[i] += a; #endif return *this; } /***************************************************************************//** * subtracts a scalar value of type T from the diagonal elements of this * matrix of general type T * @return reference to the modified matrix ******************************************************************************/ template NRMat & NRMat::operator-=(const T &a) { NOT_GPU(*this); copyonwrite(); #ifdef DEBUG if(nn != mm) laerror("nonsquare matrix"); #endif #ifdef MATPTR for(register int i=0; i< nn; i++) v[i][i] -= a; #else for(register int i=0; i< nn*nn; i+=nn+1) v[i] -= a; #endif return *this; } /***************************************************************************//** * implements unary minus operator for this double-recision real matrix * @return modified copy of this matrix ******************************************************************************/ template <> const NRMat NRMat::operator-() const { const size_t nm = (size_t)nn*mm; NRMat result(nn, mm, getlocation()); #ifdef CUDALA if(location == cpu) { #endif #ifdef MATPTR for(register size_t i=0; i const NRMat > NRMat >::operator-() const { const size_t nm = (size_t)nn*mm; NRMat > result(nn, mm, getlocation()); #ifdef CUDALA if(location == cpu) { #endif #ifdef MATPTR for(register size_t i=0; i)); cblas_zscal(nm, &CMONE, result.v, 1); #endif #ifdef CUDALA }else{ cublasZcopy(nm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)result.v, 1); TEST_CUBLAS("cublasZcopy"); cublasZscal(nm, CUMONE, (cuDoubleComplex*)result.v, 1); TEST_CUBLAS("cublasZscal"); } #endif return result; } /***************************************************************************//** * implements unary minus operator for this matrix of general type T * @return modified copy of this matrix ******************************************************************************/ template const NRMat NRMat::operator-() const { NOT_GPU(*this); NRMat result(nn, mm, getlocation()); #ifdef MATPTR for(register size_t i=0; i<(size_t)nn*mm; i++) result.v[0][i] = -v[0][i]; #else for(register size_t i=0; i<(size_t)nn*mm; i++) result.v[i] = -v[i]; #endif return result; } // direct sum template const NRMat NRMat::operator&(const NRMat &b) const { SAME_LOC(*this, b); NRMat result((T)0, nn + b.nn, mm + b.mm, getlocation()); if(!LA_traits::is_plaindata()) laerror("only implemented for plain data"); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i const NRMat NRMat::operator|(const NRMat &b) const { NRMat result(nn*b.nn, mm*b.mm); for (int i=0; iT * @return summed columns in a form of a vector ******************************************************************************/ template const NRVec NRMat::csum() const { NOT_GPU(*this); NRVec result(nn, getlocation()); T sum; for(register int i=0; i const NRVec NRMat::csum() const { NRVec result(nn, getlocation()); result = 0.0; #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i const NRVec > NRMat >::csum() const { NRVec > result(nn, getlocation()); result = 0.0; #ifdef CUDALA if(location == cpu){ #endif for(register int i=0;iT * @return summed rows in a form of a vector ******************************************************************************/ template const NRVec NRMat::rsum() const { NOT_GPU(*this); NRVec result(mm, getlocation()); T sum; for(register int i=0; i const NRVec NRMat::rsum() const { NRVec result(mm, getlocation()); result = 0.0; #ifdef CUDALA if(location == cpu){ #endif for(register int i=0;i const NRVec > NRMat >::rsum() const { NRVec > result(mm, getlocation()); result = 0.0; #ifdef CUDALA if(location == cpu){ #endif for(register int i=0;i const NRMat NRMat::submatrix(const int fromrow, const int torow, const int fromcol, const int tocol) const { #ifdef DEBUG if(fromrow<0 || fromrow>=nn|| torow<0 || torow>=nn || fromcol<0 || fromcol>=mm || tocol<0 || tocol>=mm || fromrow>torow || fromcol>tocol){ laerror("invalid submatrix specification"); } #endif const int n = torow - fromrow + 1; const int m = tocol - fromcol + 1; NRMat r(n, m, getlocation()); if(!LA_traits::is_plaindata()) laerror("only implemented for plain data"); #ifdef CUDALA if(location == cpu){ #endif for(register int i=fromrow; i<=torow; ++i){ #ifdef MATPTR memcpy(r.v[i - fromrow], v[i] + fromcol, m*sizeof(T)); #else memcpy(r.v+(i - fromrow)*m, v + i*(size_t)mm + fromcol, m*sizeof(T)); #endif } #ifdef CUDALA }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); for(register int i=fromrow; i<=torow; ++i){ cublasScopy(m*sizeof(T)/sizeof(float), (const float *)(v + i*(size_t)mm + fromcol), 1, (float*)(r.v + (i - fromrow)*m), 1); TEST_CUBLAS("cublasScopy"); } } #endif return r; } template const NRMat NRMat::submatrix(const NRVec &rows, const NRVec &cols) const { NOT_GPU(*this); const int n = rows.size(); const int m = cols.size(); NRMat r(n,m); for(int i=0; i=nn) laerror("bad row index in submatrix"); for(int j=0; j=mm) laerror("bad col index in submatrix"); r(i,j) = (*this)(ii,jj); } } return r; } /***************************************************************************//** * places given matrix as submatrix at given position * @param[in] fromrow row-coordinate of top left corner * @param[in] fromcol col-coordinate of top left corner * @param[in] rhs input matrix ******************************************************************************/ template void NRMat::storesubmatrix(const int fromrow, const int fromcol, const NRMat &rhs) { const int tocol = fromcol + rhs.ncols() - 1; const int torow = fromrow + rhs.nrows() - 1; #ifdef DEBUG if(fromrow<0 || fromrow>=nn || torow>=nn || fromcol<0 || fromcol>=mm || tocol>=mm) laerror("bad indices in storesubmatrix"); #endif SAME_LOC(*this, rhs); if(!LA_traits::is_plaindata()) laerror("only implemented for plain data"); const int m = tocol - fromcol + 1; for(register int i = fromrow; i <= torow; ++i){ #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR memcpy(v[i] + fromcol, rhs.v[i - fromrow], m*sizeof(T)); #else memcpy(v + i*(size_t)mm + fromcol, rhs.v + (i - fromrow)*m, m*sizeof(T)); #endif #ifdef CUDALA }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem"); cublasScopy(m*sizeof(T)/sizeof(float), (const float *) (rhs.v + (i - fromrow)*m), 1, (float *)(v + i*(size_t)mm + fromcol), 1); } #endif } } template void NRMat::storesubmatrix(const NRVec &rows, const NRVec &cols, const NRMat &rhs) { NOT_GPU(*this); const int n = rows.size(); const int m = cols.size(); if(rhs.nrows()!=n || rhs.ncols()!=m) laerror("incompatible dimensions in storesubmatrix"); for(int i=0; i=nn) laerror("bad row index in storesubmatrix"); for(int j=0; j=mm) laerror("bad col index in storesubmatrix"); (*this)(ii,jj) = rhs(i,j); } } } /***************************************************************************//** * compute matrix transposition for a principal leading minor * @param[in] _n order of the leading minor * @return reference to the modified matrix ******************************************************************************/ template NRMat& NRMat::transposeme(const int _n) { const int n = (n <= 0)?nn:_n;//!< transpose the entire matrix #ifdef DEBUG if (n==nn && nn != mm || n>mm || n>nn ) laerror("NRMat::transposeme() - invalid parameter n. Non-square matrix?"); #endif #ifdef CUDALA if(location == cpu){ #endif copyonwrite(); for(register int i=1; i const typename LA_traits::normtype NRMat::nonsymmetry() const { #ifdef DEBUG if (nn != mm) laerror("NRMat:nonsymmetry() invalid for non-square matrix"); #endif typename LA_traits::normtype sum = 0; #ifdef CUDALA if(location == cpu){ #endif for(register int i=1; i const double NRMat >::nonhermiticity() const { #ifdef DEBUG if (nn != mm) laerror("NRMat:nonsymmetry() invalid for non-square matrix"); #endif double sum = 0; std::complex tmp; #ifdef CUDALA if(location == cpu){ #endif for(register int i=1; i (v[i][j].real()-v[j][i].real(),v[i][j].imag()+v[j][i].imag()); #else register int a, b; a = i*(size_t)mm + j; b = j*(size_t)mm + i; tmp = std::complex (v[a].real() - v[b].real(), v[a].imag()+v[b].imag()); #endif sum += tmp.real()*tmp.real()+tmp.imag()*tmp.imag(); } } #ifdef CUDALA }else{ laerror("nonsymmetry not implemented on GPU yet"); } #endif return sum; } template <> const double NRMat >::nonsymmetry() const { #ifdef DEBUG if (nn != mm) laerror("NRMat:nonsymmetry() invalid for non-square matrix"); #endif double sum = 0; std::complex tmp; #ifdef CUDALA if(location == cpu){ #endif for(register int i=1; i NRMat >::NRMat(const NRMat &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) { const size_t nn_mm = (size_t)nn*mm; #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR v = new std::complex*[n]; v[0] = new std::complex[nn_mm]; for(register int i=1; i)); cblas_dcopy(nn_mm, &rhs[0][0], 1, ((double *)v[0]) + (imagpart?1:0), 2); #else v = new std::complex[nn_mm]; memset(v, 0, nn_mm*sizeof(std::complex)); cblas_dcopy(nn_mm, &rhs[0][0], 1, ((double *)v) + (imagpart?1:0), 2); #endif #ifdef CUDALA }else{ v = (std::complex*)gpualloc(sizeof(std::complex)*nn_mm); std::complex *_val = gpuputcomplex(CZERO); cublasZcopy(nn_mm, (cuDoubleComplex*)_val, 0, (cuDoubleComplex*)v, 1); TEST_CUBLAS("cublasZcopy"); gpufree(_val); cublasDcopy(nn_mm, (double*)(&rhs[0][0]), 1, ((double*)v) + (imagpart?1:0), 2); TEST_CUBLAS("cublasDcopy"); } #endif } /***************************************************************************//** * create double-precision matrix from complex double-precision matrix \f$A\f$ * @param[in] rhs complex double-precision matrix \f$A\f$ * @param[in] imagpart flag indicating whether the matrix \f$A\f$ should be taken as the real * or imaginary part of the input complex matrix ******************************************************************************/ template<> NRMat::NRMat(const NRMat > &rhs, bool imagpart): nn(rhs.nrows()), mm(rhs.ncols()), count(new int(1)) { const size_t nn_mm = (size_t) nn*mm; #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR v = new double*[n]; v[0] = new double[nn_mm]; for(register int i=1; i void NRMat::fprintf(FILE *file, const char *format, const int modulo) const { #ifdef CUDALA if(location == cpu){ #endif lawritemat(file, (const T*)(*this), nn, mm, format, 2, modulo, 0); #ifdef CUDALA }else{ NRMat tmp = *this; tmp.moveto(cpu); lawritemat(file, (const T*)(tmp), nn, mm, format, 2, modulo, 0); } #endif } /***************************************************************************//** * input of a matrix of general type via lawritemat ******************************************************************************/ template void NRMat::fscanf(FILE *f, const char *format) { T *p; NRMat *tmp; int n(0), m(0); if (::fscanf(f, "%d %d", &n, &m) != 2) laerror("cannot read matrix dimensions"); #ifdef CUDALA if(location == cpu){ p = *this; }else{ tmp = new NRMat(n, m, this->location); p = *tmp; } #endif resize(n, m); for(int i=0; i types //----------------------------------------------------------------------------- /***************************************************************************//** * for a given real matrix \f$A\f$ compute \f$A^\mathrm{T}A\f$ * @return real NRSMat object because of the symmetry of \f$A^\mathrm{T}A\f$ ******************************************************************************/ template<> const NRSMat NRMat::transposedtimes() const { int i(0), j(0); NRSMat r(mm, getlocation());//!< resulting matrix has mm rows #ifdef CUDALA if(location == cpu){ #endif for(i=0; ilocation); } #endif return r; } /***************************************************************************//** * for a given complex matrix \f$A\f$ compute \f$A^\dagger{}A\f$ * @return complex NRSMat object because of the hermiticity of \f$A^\dagger{}A\f$ ******************************************************************************/ template<> const NRSMat > NRMat >::transposedtimes() const { int i(0), j(0); NRSMat > r(mm, getlocation()); #ifdef CUDALA if(location == cpu){ #endif for(i=0; i*> (&val)); } } r.moveto(this->location); } #endif return r; } /***************************************************************************//** * for a given matrix \f$A\f$ (general type) compute \f$A^\mathrm{T}A\f$ * @return NRSMat object because of the symmetry of the result ******************************************************************************/ template const NRSMat NRMat::transposedtimes() const { int i(0), j(0); NOT_GPU(*this); NRSMat r(mm, getlocation()); for(i=0; i const NRSMat NRMat::timestransposed() const { int i(0), j(0); NRSMat r(nn, getlocation());//!< resulting matrix has nn rows #ifdef CUDALA if(location == cpu){ #endif for(i=0; ilocation); } #endif return r; } /***************************************************************************//** * for a given complex matrix \f$A\f$ compute \f$AA^\dagger{}\f$ * @return complex NRSMat object because of the hermiticity of \f$AA^\dagger{}\f$ ******************************************************************************/ template<> const NRSMat > NRMat >::timestransposed() const { int i(0), j(0); NRSMat > r(nn, getlocation()); #ifdef CUDALA if(location == cpu){ #endif for(i=0; i*> (&val)); } } r.moveto(this->location); } #endif return r; } /***************************************************************************//** * for a given matrix \f$A\f$ (general type) compute \f$A^\mathrm{T}A\f$ * @return NRSMat object because of the symmetry of the result ******************************************************************************/ template const NRSMat NRMat::timestransposed() const { int i(0), j(0); NOT_GPU(*this); NRSMat r(nn); for(i=0; i void NRMat::randomize(const double &x) { #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i tmp(nn, mm, cpu); double *tmp_data = tmp; for(register size_t i=0; i<(size_t)nn*mm; ++i){ tmp_data[i] = x*RANDDOUBLESIGNED(); } tmp.moveto(this->location); *this |= tmp; } #endif } /***************************************************************************//** * fill given complex matrix with random numbers * real/imaginary components are generated independently * @param[in] x generate random numbers from the interval [0, x] ******************************************************************************/ template<> void NRMat >::randomize(const double &x) { #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i(re, im); } } #ifdef CUDALA }else{ NRMat > tmp(nn, mm, cpu); std::complex *tmp_data = tmp; for(register size_t i=0; i<(size_t)nn*mm; ++i){ const double re = x*RANDDOUBLESIGNED(); const double im = x*RANDDOUBLESIGNED(); tmp_data[i] = std::complex(re, im); } tmp.moveto(this->location); *this |= tmp; } #endif } /***************************************************************************//** * scale real matrix with a real factor * @param[in] a scaling factor * @return reference to the modified matrix ******************************************************************************/ template<> NRMat& NRMat::operator*=(const double &a) { copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_dscal((size_t)nn*mm, a, *this, 1); #ifdef CUDALA }else{ cublasDscal((size_t)nn*mm, a, v, 1); TEST_CUBLAS("cublasDscal"); } #endif return *this; } /***************************************************************************//** * scale complex matrix with a complex factor * @param[in] a scaling factor * @return reference to the modified matrix ******************************************************************************/ template<> NRMat > & NRMat >::operator*=(const std::complex &a) { copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_zscal((size_t)nn*mm, &a, (*this)[0], 1); #ifdef CUDALA }else{ const cuDoubleComplex fac = *(reinterpret_cast (&a)); cublasZscal((size_t)nn*mm, fac, (cuDoubleComplex *)v, 1); TEST_CUBLAS("cublasZscal"); } #endif return *this; } /***************************************************************************//** * scale matrix of type T with a factor * @param[in] a scaling factor * @return reference to the modified matrix ******************************************************************************/ template NRMat & NRMat::operator*=(const T &a) { NOT_GPU(*this); copyonwrite(); #ifdef MATPTR for(register size_t i=0; i< (size_t)nn*mm; i++) v[0][i] *= a; #else for(register size_t i=0; i< (size_t)nn*mm; i++) v[i] *= a; #endif return *this; } /***************************************************************************//** * add a given real matrix \f$A\f$ to the current real matrix * @param[in] rhs matrix \f$A\f$ * @return reference to the modified matrix ******************************************************************************/ template<> NRMat & NRMat::operator+=(const NRMat &rhs) { #ifdef DEBUG if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif SAME_LOC(*this, rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_daxpy((size_t)nn*mm, 1.0, rhs, 1, *this, 1); #ifdef CUDALA }else{ cublasDaxpy((size_t)nn*mm, 1.0, rhs, 1, v, 1); TEST_CUBLAS("cublasDaxpy"); } #endif return *this; } /***************************************************************************//** * add a given complex matrix \f$A\f$ to the current complex matrix * @param[in] rhs complex matrix \f$A\f$ * @return reference to the modified matrix ******************************************************************************/ template<> NRMat > & NRMat >::operator+=(const NRMat< std::complex > &rhs) { #ifdef DEBUG if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif SAME_LOC(*this, rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_zaxpy((size_t)nn*mm, &CONE, rhs[0], 1, (*this)[0], 1); #ifdef CUDALA }else{ cublasZaxpy((size_t)nn*mm, CUONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); } #endif return *this; } /***************************************************************************//** * subtract a given real matrix \f$A\f$ from the current real matrix * @param[in] rhs matrix \f$A\f$ * @return reference to the modified matrix ******************************************************************************/ template<> NRMat & NRMat::operator-=(const NRMat &rhs) { #ifdef DEBUG if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif SAME_LOC(*this,rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_daxpy((size_t)nn*mm, -1.0, rhs, 1, *this, 1); #ifdef CUDALA }else{ cublasDaxpy((size_t)nn*mm, -1.0, rhs, 1, v, 1); } #endif return *this; } /***************************************************************************//** * subtract a given complex matrix \f$A\f$ from the current complex matrix * @param[in] rhs matrix \f$A\f$ * @return reference to the modified matrix ******************************************************************************/ template<> NRMat< std::complex > & NRMat< std::complex >::operator-=(const NRMat< std::complex > &rhs) { #ifdef DEBUG if (nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices"); #endif SAME_LOC(*this, rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_zaxpy((size_t)nn*mm, &CMONE, rhs[0], 1, (*this)[0], 1); #ifdef CUDALA }else{ cublasZaxpy((size_t)nn*mm, CUMONE, (cuDoubleComplex*)(rhs[0]), 1, (cuDoubleComplex*)((*this)[0]), 1); } #endif return *this; } /***************************************************************************//** * add a given sparse real matrix \f$A\f$ stored in packed form to the current * real matrix * @param[in] rhs symmetric real matrix \f$A\f$ in packed form * @return reference to the modified matrix * @see NRSMat ******************************************************************************/ template<> NRMat & NRMat::operator+=(const NRSMat &rhs) { #ifdef DEBUG if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif const double *p = rhs; SAME_LOC(*this, rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i ******************************************************************************/ template<> NRMat< std::complex > & NRMat< std::complex >::operator+=(const NRSMat< std::complex > &rhs) { #ifdef DEBUG if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif const std::complex *p = rhs; SAME_LOC(*this, rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i ******************************************************************************/ template NRMat & NRMat::operator+=(const NRSMat &rhs) { #ifdef DEBUG if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif const T *p = rhs; SAME_LOC(*this, rhs); NOT_GPU(*this); copyonwrite(); for(register int i=0; i ******************************************************************************/ template<> NRMat & NRMat::operator-=(const NRSMat &rhs) { #ifdef DEBUG if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif const double *p = rhs; SAME_LOC(*this, rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i ******************************************************************************/ template<> NRMat > & NRMat >::operator-=(const NRSMat > &rhs) { #ifdef DEBUG if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif const std::complex *p = rhs; SAME_LOC(*this, rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i ******************************************************************************/ template NRMat & NRMat::operator-=(const NRSMat &rhs) { #ifdef DEBUG if (nn != rhs.nn || mm != rhs.nn) laerror("incompatible matrices"); #endif SAME_LOC(*this, rhs); NOT_GPU(*this); const T *p = rhs; copyonwrite(); for(register int i=0; i const double NRMat::dot(const NRMat &rhs) const { #ifdef DEBUG if(nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices in NRMat::dot(const NRMat&)"); #endif double ret(0.0); SAME_LOC(*this, rhs); #ifdef CUDALA if(location == cpu){ #endif ret = cblas_ddot((size_t)nn*mm, (*this)[0], 1, rhs[0], 1); #ifdef CUDALA }else{ ret = cublasDdot((size_t)nn*mm, v, 1, rhs.v, 1); } #endif return ret; } /***************************************************************************//** * compute scalar product of this matrix \f$A\f$ with given complex matrix \f$B\f$ * i.e. determine \f$\sum_{i,j}A^{*}_{i,j}B_{i,j}\f$ * @param[in] rhs matrix \f$B\f$ * @return computed scalar product ******************************************************************************/ template<> const std::complex NRMat >::dot(const NRMat > &rhs) const { #ifdef DEBUG if(nn != rhs.nn || mm != rhs.mm) laerror("incompatible matrices in NRMat >::dot(const NRMat >&)"); #endif std::complex ret(0.0, 0.0); #ifdef CUDALA if(location == cpu){ #endif cblas_zdotc_sub((size_t)nn*mm, (*this)[0], 1, rhs[0], 1, &ret); #ifdef CUDALA }else{ cuDoubleComplex val = cublasZdotc((size_t)nn*mm, (cuDoubleComplex*)v, 1, (cuDoubleComplex*)(rhs.v), 1); ret = *(reinterpret_cast*> (&val)); } #endif return ret; } /***************************************************************************//** * compute product of this matrix \f$A\f$ with given real matrix \f$B\f$ * @param[in] rhs matrix \f$B\f$ * @return computed product by value ******************************************************************************/ template<> const NRMat NRMat::operator*(const NRMat &rhs) const { #ifdef DEBUG if(mm != rhs.nn) laerror("incompatible matrices in NRMat::operator*(const NRMat&)"); if(rhs.mm <= 0) laerror("illegal matrix dimension in gemm"); #endif SAME_LOC(*this, rhs); NRMat result(nn, rhs.mm, getlocation()); #ifdef CUDALA if(location == cpu){ #endif cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nn, rhs.mm, mm, 1.0, *this, mm, rhs, rhs.mm, 0.0, result, rhs.mm); #ifdef CUDALA }else{ cublasDgemm('N', 'N', rhs.mm, nn, mm, 1.0, rhs, rhs.mm, *this, mm, 0.0, result, rhs.mm); } #endif return result; } /***************************************************************************//** * compute product of this matrix \f$A\f$ with given complex matrix \f$B\f$ * @param[in] rhs matrix \f$B\f$ * @return computed product by value ******************************************************************************/ template<> const NRMat< std::complex > NRMat< std::complex >::operator*(const NRMat< std::complex > &rhs) const { #ifdef DEBUG if(mm != rhs.nn) laerror("incompatible matrices in NRMat >::operator*(const NRMat >&)"); if(rhs.mm <= 0) laerror("illegal matrix dimension in gemm"); #endif SAME_LOC(*this, rhs); NRMat > result(nn, rhs.mm, getlocation()); #ifdef CUDALA if(location == cpu){ #endif cblas_zgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, nn, rhs.mm, mm, &CONE, (*this)[0], mm, rhs[0], rhs.mm, &CZERO, result[0], rhs.mm); #ifdef CUDALA }else{ cublasZgemm('N', 'N', rhs.mm, nn, mm, CUONE, (cuDoubleComplex*)rhs.v, rhs.mm, (cuDoubleComplex*)(this->v), mm, CUZERO, (cuDoubleComplex*)result.v, rhs.mm); } #endif return result; } /***************************************************************************//** * multiply this real matrix \f$A\f$ by diagonal real matrix \f$D\f$ from left * because of cuBlas implementation, \f$D\f$ is required to be placed in CPU memory * @param[in] rhs real vector represeting the diagonal of matrix \f$D\f$ ******************************************************************************/ template<> void NRMat::diagmultl(const NRVec &rhs) { #ifdef DEBUG if(nn != rhs.size()) laerror("incompatible matrices in NRMat::diagmultl(const NRVec&)"); #endif NOT_GPU(rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i void NRMat< std::complex >::diagmultl(const NRVec< std::complex > &rhs) { #ifdef DEBUG if (nn != rhs.size()) laerror("incompatible matrices in NRMat >::diagmultl(const NRVec >&)"); #endif NOT_GPU(rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i void NRMat::diagmultr(const NRVec &rhs) { #ifdef DEBUG if(mm != rhs.size()) laerror("incompatible matrices in NRMat::diagmultr(const NRVec&)"); #endif NOT_GPU(rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i void NRMat< std::complex >::diagmultr(const NRVec< std::complex > &rhs) { #ifdef DEBUG if(mm != rhs.size()) laerror("incompatible matrices in NRMat >::diagmultr(const NRVec >&)"); #endif NOT_GPU(rhs); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i const NRMat NRMat::operator*(const NRSMat &rhs) const { #ifdef DEBUG if(mm != rhs.nrows()) laerror("incompatible matrices int NRMat::operator*(const NRSMat &)"); #endif SAME_LOC(*this, rhs); const int rhs_ncols = rhs.ncols(); NRMat result(nn, rhs_ncols, getlocation()); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i const NRMat< std::complex > NRMat< std::complex >::operator*(const NRSMat< std::complex > &rhs) const { #ifdef DEBUG if(mm != rhs.nrows()) laerror("incompatible matrices int NRMat >::operator*(const NRSMat > &)"); #endif SAME_LOC(*this, rhs); const int rhs_ncols = rhs.ncols(); NRMat > result(nn, rhs_ncols, getlocation()); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i NRMat& NRMat::conjugateme() { return *this; } /***************************************************************************//** * conjugate this complex matrix \f$A\f$, or leading minor of size n * @return reference to the modified matrix ******************************************************************************/ template<> NRMat >& NRMat >::conjugateme() { copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_dscal((size_t)mm*nn, -1.0, (double *)((*this)[0]) + 1, 2); #ifdef CUDALA }else{ cublasDscal((size_t)mm*nn, -1.0, (double *)(this->v) + 1, 2); } #endif return *this; } template<> NRMat >& NRMat >::conjugateme() { copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_sscal((size_t)mm*nn, -1.0, (float *)((*this)[0]) + 1, 2); #ifdef CUDALA }else{ cublasSscal((size_t)mm*nn, -1.0, (float *)(this->v) + 1, 2); } #endif return *this; } /***************************************************************************//** * compute transpose (optionally conjugated) of this real matrix \f$A\f$ * @param[in] conj conjugation flag, unused for real matrices * @return transposed (conjugated) matrix by value ******************************************************************************/ template<> const NRMat NRMat::transpose(bool conj) const { NRMat result(mm, nn, getlocation()); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i const NRMat > NRMat >::transpose(bool conj) const { NRMat > result(mm, nn, getlocation()); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i void NRMat::gemm(const double &beta, const NRMat &a, const char transa, const NRMat &b, const char transb, const double &alpha) { int k(tolower(transa)=='n'?a.mm:a.nn); #ifdef DEBUG int l(tolower(transa)=='n'?a.nn:a.mm); int kk(tolower(transb)=='n'?b.nn:b.mm); int ll(tolower(transb)=='n'?b.mm:b.nn); if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat::gemm(...)"); if(b.mm <=0 || mm<=0) laerror("illegal matrix dimension in gemm"); #endif SAME_LOC3(*this, a, b); if (alpha==0.0 && beta==1.0) return; copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_dgemm(CblasRowMajor, (tolower(transa)=='n' ? CblasNoTrans : CblasTrans), (tolower(transb)=='n' ? CblasNoTrans : CblasTrans), nn, mm, k, alpha, a, a.mm, b , b.mm, beta, *this , mm); #ifdef CUDALA }else{ cublasDgemm(transb, transa, mm, nn, k, alpha, b, b.mm, a, a.mm, beta, *this, mm); } #endif } template<> void NRMat >::gemm(const std::complex & beta, const NRMat > & a, const char transa, const NRMat > & b, const char transb, const std::complex & alpha) { int k(tolower(transa)=='n'?a.mm:a.nn); #ifdef DEBUG int l(tolower(transa)=='n'?a.nn:a.mm); int kk(tolower(transb)=='n'?b.nn:b.mm); int ll(tolower(transb)=='n'?b.mm:b.nn); if (l!=nn || ll!=mm || k!=kk) laerror("incompatible matrices in NRMat >::gemm(...)"); #endif if (alpha==CZERO && beta==CONE) return; copyonwrite(); cblas_zgemm(CblasRowMajor, (tolower(transa)=='n' ? CblasNoTrans : (tolower(transa)=='c'?CblasConjTrans:CblasTrans)), (tolower(transb)=='n' ? CblasNoTrans : (tolower(transb)=='c'?CblasConjTrans:CblasTrans)), nn, mm, k, &alpha, a , a.mm, b , b.mm, &beta, *this , mm); } /***************************************************************************//** * compute the Frobenius norm of the current real matrix \f$A\f$, i.e. * \f[ \sqrt{\sum_{i=1}^{N}\sum_{j=1}^{M}\left|A_{i,j}\right|^2} \f] * where \f$N\f$ and \f$M\f$ is the number of rows and columns, respectively * @param[in] scalar real value subtracted from the diagonal elements * @return computed norm ******************************************************************************/ template<> const double NRMat::norm(const double scalar) const { if(!scalar){ #ifdef CUDALA if(location == cpu){ #endif return cblas_dnrm2((size_t)nn*mm, (*this)[0], 1); #ifdef CUDALA }else{ return cublasDnrm2((size_t)nn*mm, v, 1); } #endif } NOT_GPU(*this); double sum(0.0); for(register int i=0; i const double NRMat >::norm(const std::complex scalar) const { if(scalar == CZERO){ #ifdef CUDALA if(location == cpu){ #endif return cblas_dznrm2((size_t)nn*mm, (*this)[0], 1); #ifdef CUDALA }else{ return cublasDznrm2((size_t)nn*mm, (cuDoubleComplex*)v, 1); } #endif } NOT_GPU(*this); double sum(0.0); for(register int i=0; i tmp(0.0, 0.0); #ifdef MATPTR tmp = v[i][j]; #else tmp = v[i*(size_t)mm+j]; #endif if(i == j) tmp -= scalar; const double re = tmp.real(); const double im = tmp.imag(); sum += re*re + im*im; } return std::sqrt(sum); } /***************************************************************************//** * perform the axpy operation on the current real matrix \f$A\f$, i.e. * \f[ A \leftarrow A + \alpha{}B \f] * for real matrix \f$B\f$ * @param[in] alpha \f$\alpha\f$ parameter * @param[in] mat real matrix \f$B\f$ ******************************************************************************/ template<> void NRMat::axpy(const double alpha, const NRMat &mat) { #ifdef DEBUG if (nn != mat.nn || mm != mat.mm) laerror("incompatible matrices in NRMat::axpy(...)"); #endif SAME_LOC(*this, mat); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_daxpy((size_t)nn*mm, alpha, mat, 1, *this, 1); #ifdef CUDALA }else{ cublasDaxpy((size_t)nn*mm, alpha, mat, 1, *this, 1); } #endif } /***************************************************************************//** * perform the axpy operation on the current complex matrix \f$A\f$, i.e. * \f[ A \leftarrow A + \alpha{}B \f] * for real matrix \f$B\f$ * @param[in] alpha complex parameter \f$\alpha\f$ * @param[in] mat complex matrix \f$B\f$ ******************************************************************************/ template<> void NRMat >::axpy(const std::complex alpha, const NRMat > & mat) { #ifdef DEBUG if (nn != mat.nn || mm != mat.mm) laerror("incompatible matrices in NRMat >::axpy(...)"); #endif SAME_LOC(*this, mat); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_zaxpy((size_t)nn*mm, &alpha, mat, 1, (*this)[0], 1); #ifdef CUDALA }else{ const cuDoubleComplex _alpha = make_cuDoubleComplex(alpha.real(), alpha.imag()); cublasZaxpy(nn*mm, _alpha, (cuDoubleComplex*)(mat[0]), 1, (cuDoubleComplex*)(this->v), 1); } #endif } /***************************************************************************//** * compute the trace of current genenal square matrix \f$A\f$, i.e. * \f[ \sum_{i=1}^{N} A_{i,i} \f] * where \f$N\f$ is the order of the matrix ******************************************************************************/ template const T NRMat::trace() const { #ifdef DEBUG if (nn != mm) laerror("nonsquare matrix in NRMat::trace()"); #endif NOT_GPU(*this); T sum(0); #ifdef MATPTR for(register int i=0; idivide == true NULL * \li divide == false pointer to the first element of r ******************************************************************************/ template<> const double* NRMat::diagonalof(NRVec &r, const bool divide, bool cache) const { double *ret(NULL); #ifdef DEBUG if(r.size() != mm) laerror("incompatible vector in NRMat::diagonalof(...)"); #endif double a(0.0);//!< temporary variable for storing the scaling factor SAME_LOC(*this, r); if(divide){ NOT_GPU(*this); } r.copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif if(nn==mm){ #ifdef MATPTR if(divide){ for(int i=0; i< nn; i++) if((a=v[i][i])) r[i] /= a; }else{ for(int i=0; i< nn; i++) r[i] = v[i][i]; } #else if(divide){ int i(0),j(0); for(i=j=0; j tmp(mm, cpu); for(int i=0;i void NRMat::diagonalset(const NRVec &r) { int nnmin= nn<=mm?nn:mm; #ifdef DEBUG if(r.size() != nnmin) laerror("incompatible vectors int NRMat::diagonalset(...)"); #endif SAME_LOC(*this, r); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR for (int i=0; i void NRMat >::diagonalset(const NRVec > &r) { int nnmin= nn<=mm?nn:mm; #ifdef DEBUG if(r.size() != nnmin) laerror("incompatible vectors int NRMat >::diagonalset(...)"); #endif SAME_LOC(*this, r); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif #ifdef MATPTR for (int i=0; iv), mm+1); } #endif } /***************************************************************************//** * perform straightforward orthonormalization via modified Gram-Schmidt process * @param[in] rowcol flag regarding the interpretation of the current matrix * \li \c true the vectors being orthonormalized are stored as rows * \li \c false the vectors being orthonormalized are stored as columns * @param[in] metric pointer to real symmetric matrix stored in packed form which * is used in computing the inner products in the process, the standard inner product * is taken into account for metric == NULL * @return void ******************************************************************************/ template<> void NRMat::orthonormalize(const bool rowcol, const NRSMat *metric) { SAME_LOC(*this, *metric); if(metric){ if(rowcol){ //vectors are stored in rows if((*metric).nrows() != mm) laerror("incompatible metric in NRMat::orthonormalize(rowcol = true)"); #ifdef CUDALA if(location == cpu){ #endif for(register int j=0; j tmp = *metric * (*this).row(i); const double fact = cblas_ddot(mm,(*this)[j],1,tmp,1); cblas_daxpy(mm,-fact,(*this)[i],1,(*this)[j],1); } const NRVec tmp = *metric * (*this).row(j); const double norm = cblas_ddot(mm,(*this)[j],1,tmp,1); if(norm <= 0.) laerror("zero vector or nonpositive metric in NRMat::orthonormalize(...)"); cblas_dscal(mm,1./std::sqrt(norm),(*this)[j],1); } #ifdef CUDALA }else{ for(register int j=0; j tmp(mm, location); tmp = *metric * (*this).row(i); const double fact = cublasDdot(mm, (*this)[j], 1, tmp, 1); cublasDaxpy(mm, -fact, (*this)[i], 1, (*this)[j], 1); } NRVec tmp(mm, location); tmp = *metric * (*this).row(j); const double norm = cublasDdot(mm, (*this)[j], 1, tmp, 1); if(norm <= 0.) laerror("zero vector or nonpositive metric in NRMat::orthonormalize(...)"); cublasDscal(mm, 1./std::sqrt(norm), (*this)[j], 1); } } #endif }else{ //vectors are stored in columns #ifdef CUDALA if(location = cpu){ #endif if((*metric).nrows() != nn) laerror("incompatible metric in NRMat::orthonormalize(rowcol = false)"); for(register int j=0; j tmp = *metric * (*this).column(i); double fact = cblas_ddot(nn, &(*this)[0][j], mm, tmp, 1); cblas_daxpy(nn, -fact, &(*this)[0][i], mm, &(*this)[0][j], mm); } NRVec tmp = *metric * (*this).column(j); double norm = cblas_ddot(nn, &(*this)[0][j], mm, tmp, 1); if(norm <= 0.) laerror("zero vector or nonpositive metric in NRMat::orthonormalize(...)"); cblas_dscal(nn, 1./std::sqrt(norm), &(*this)[0][j], mm); } #ifdef CUDALA }else{ if((*metric).nrows() != nn) laerror("incompatible metric in NRMat::orthonormalize(rowcol = false)"); for(register int j=0; j tmp(nn, location); tmp = *metric * (*this).column(i); double fact = cublasDdot(nn, &(*this)[0][j], mm, tmp, 1); cublasDaxpy(nn, -fact, &(*this)[0][i], mm, &(*this)[0][j], mm); } NRVec tmp(nn, location); tmp = *metric * (*this).column(j); double norm = cublasDdot(nn, &(*this)[0][j], mm, tmp, 1); if(norm <= 0.) laerror("zero vector or nonpositive metric in NRMat::orthonormalize(...)"); cublasDscal(nn, 1./std::sqrt(norm), &(*this)[0][j], mm); } } #endif } }else{ //unit metric (standard inner product) will be used if(rowcol){ #ifdef CUDALA if(location == cpu){ #endif for(register int j=0; j::orthonormalize(...)"); cblas_dscal(mm,1./norm,(*this)[j],1); } #ifdef CUDALA }else{ for(register int j=0; j::orthonormalize(...)"); cublasDscal(mm, 1./norm, (*this)[j], 1); } } #endif }else{ // vectors are stored in columns #ifdef CUDALA if(location == cpu){ #endif for(register int j=0; j::orthonormalize(...)"); cblas_dscal(nn, 1./norm, &(*this)[0][j], mm); } #ifdef CUDALA }else{ for(register int j=0; j::orthonormalize(...)"); cublasDscal(nn, 1./norm, &(*this)[0][j], mm); } } #endif } } //end of the unit-metric branch } /***************************************************************************//** * interchange the order of the rows of the current (real) matrix * @return reference to the modified matrix ******************************************************************************/ template<> NRMat& NRMat::swap_rows(){ copyonwrite(); const int n_pul = this->nn >> 1; #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i NRMat& NRMat::swap_rows(const int i, const int j){ copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_dswap(mm, (*this)[i], 1, (*this)[j], 1); #ifdef CUDALA }else{ cublasDswap(mm, v + i*(size_t)mm, 1, v + j*mm, 1); TEST_CUBLAS("cublasDswap"); } #endif return *this; } /***************************************************************************//** * interchange the order of the rows of the current (complex) matrix * @return reference to the modified matrix ******************************************************************************/ template<> NRMat >& NRMat >::swap_rows(){ copyonwrite(); const int n_pul = this->nn >> 1; #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i NRMat >& NRMat >::swap_rows(const int i, const int j){ copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_zswap(mm, (*this)[i], 1, (*this)[j], 1); #ifdef CUDALA }else{ cublasZswap(mm, (cuDoubleComplex*)(v + i*(size_t)mm), 1, (cuDoubleComplex*)(v + j*(size_t)mm), 1); TEST_CUBLAS("cublasZswap"); } #endif return *this; } /***************************************************************************//** * interchange the order of the rows of the current general matrix of type T * for GPU computations, the condition sizeof(T)%sizeof(float) is required * @return reference to the modified matrix ******************************************************************************/ template NRMat& NRMat::swap_rows(){ T tmp; copyonwrite(); const int n_pul = this->nn >> 1; #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i::swap_rows"); for(register int i=0; i NRMat& NRMat::swap_cols(){ copyonwrite(); const int m_pul = mm >> 1; #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i NRMat& NRMat::swap_cols(const int i, const int j){ copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_dswap(nn, &((*this)(0, i)), mm, &((*this)(0, j)), mm); #ifdef CUDALA }else{ cublasDswap(nn, v + i, mm, v + j, mm); TEST_CUBLAS("cublasDswap"); } #endif return *this; } /***************************************************************************//** * interchange the order of the columns of the current (complex) matrix * @return reference to the modified matrix ******************************************************************************/ template<> NRMat >& NRMat >::swap_cols(){ copyonwrite(); const int m_pul = mm >> 1; #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i NRMat >& NRMat >::swap_cols(const int i, const int j){ copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif cblas_zswap(nn, &((*this)(0, i)), mm, &((*this)(0, j)), mm); #ifdef CUDALA }else{ cublasZswap(nn, (cuDoubleComplex*)(v + i), mm, (cuDoubleComplex*)(v + j), mm); TEST_CUBLAS("cublasZswap"); } #endif return *this; } /***************************************************************************//** * interchange the order of the columns of the current general matrix of type T * because of the cuBlas implementation, the GPU version requires that * sizeof(T)%sizeof(float)==0 * @return reference to the modified matrix ******************************************************************************/ template NRMat& NRMat::swap_cols(){ T tmp; copyonwrite(); const int m_pul = mm >> 1; #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i::swap_cols"); for(register int i=0; i NRMat& NRMat::swap_cols(const int a, const int b){ T tmp; copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int j=0;j::swap_cols"); cublasSswap(nn*sizeof(T)/sizeof(float), (float *)(v + a), mm*sizeof(T)/sizeof(float), (float *)(v + b), mm*sizeof(T)/sizeof(float) ); TEST_CUBLAS("cublasSswap"); } #endif return *this; } /*interchange two rows*/ template NRMat& NRMat::swap_rows(const int a, const int b){ T tmp; copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int j=0;j::swap_rows"); cublasSswap(nn*sizeof(T)/sizeof(float), (float *)(v + a*mm), sizeof(T)/sizeof(float), (float *)(v + b*mm), sizeof(T)/sizeof(float) ); TEST_CUBLAS("cublasSswap"); } #endif return *this; } /*rotate rows or columns of a matrix - general implementation, more efficient version could be done with BLAS scal and axpy operations * but it would require allocation of temporary storage */ template NRMat& NRMat::rotate_rows(const int a, const int b, const T phi){ T tmp1,tmp2; copyonwrite(); T c=cos(phi); T s=sin(phi); #ifdef CUDALA if(location == cpu){ #endif for(register int j=0;j NRMat& NRMat::rotate_cols(const int a, const int b, const T phi){ T tmp1,tmp2; copyonwrite(); T c=cos(phi); T s=sin(phi); #ifdef CUDALA if(location == cpu){ #endif for(register int j=0;j NRMat& NRMat::swap_rows_cols(){ const int n_pul = nn >> 1; const int m_pul = mm >> 1; double tmp(0.0); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0; i NRMat >& NRMat >::swap_rows_cols(){ const int n_pul = nn >> 1; const int m_pul = mm >> 1; std::complex tmp(0.0, 0.0); copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif for(register int i=0;i)*mm); cublasZswap(mm, (cuDoubleComplex*)(v + n_pul*mm + mm - 1), -1, (cuDoubleComplex*)gpu_ptr, 1); cublasZcopy(mm, (cuDoubleComplex*)gpu_ptr, 1, (cuDoubleComplex*)(v + n_pul*mm), 1); gpufree(gpu_ptr); } } #endif return *this; } /***************************************************************************//** * interchange the order of the rows and columns of the current * general matrix \f$A\f$ of type T, i.e. perform the operation * \f[A_{i,j}\leftarrow A_{nn-1-i, mm-1-j}\f] * where \f$0\leq{}i\le{}nn\f$ and \f$0\leq{}j\le{}mm\f$ * @return reference to the modified matrix ******************************************************************************/ template NRMat& NRMat::swap_rows_cols(){ const int n_pul = nn >> 1; const int m_pul = mm >> 1; const size_t dim = (size_t)nn*mm; T *data_ptr; T tmp; copyonwrite(); #ifdef CUDALA if(location == cpu){ #endif data_ptr = (*this)[0]; const int dim_pul = dim >> 1; for(register int i=0; i<=dim_pul; i++){ tmp = data_ptr[i]; data_ptr[i] = data_ptr[dim - i - 1]; data_ptr[dim - i - 1] = tmp; } #ifdef CUDALA }else{ if(sizeof(T)%sizeof(float) != 0) laerror("cpu memcpy alignment problem in NRMat::swap_rows_cols"); for(register int i=0; i void NRMat::axpy(const T alpha, const NRPerm &p, const bool direction) { int n=p.size(); for(int i=0; i NRMat::NRMat(const NRPerm &p, const bool direction, const bool parity) { int n=p.size(); resize(n,n); clear(); T alpha= parity? p.parity():1; axpy(alpha,p,direction); } template NRMat::NRMat(const WeightPermutation &wp, const bool direction) { int n=wp.size(); resize(n,n); clear(); axpy(wp.weight,wp.perm,direction); } template NRMat::NRMat(const PermutationAlgebra &ap, const bool direction) { int na= ap.size(); if(na<=0) laerror("cannot deduce matrix size from empty PermutationAlgebra"); int n=ap[0].size(); resize(n,n); clear(); for(int i=0; i const NRMat NRMat::permuted_rows(const NRPerm &p, const bool inverse) const { #ifdef DEBUG if(!p.is_valid()) laerror("invalid permutation of matrix"); #endif int n=p.size(); if(n!=nn) laerror("incompatible permutation and matrix"); #ifdef CUDALA if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory"); #endif NRMat r(nn,mm); if(inverse) for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=0; j const NRMat NRMat::permuted_cols(const NRPerm &p, const bool inverse) const { #ifdef DEBUG if(!p.is_valid()) laerror("invalid permutation of matrix"); #endif int n=p.size(); if(n!=mm) laerror("incompatible permutation and matrix"); #ifdef CUDALA if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory"); #endif NRMat r(nn,mm); if(inverse) for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=0; j const NRMat NRMat::permuted_both(const NRPerm &p, const NRPerm &q, const bool inverse) const { #ifdef DEBUG if(!p.is_valid() || !q.is_valid() ) laerror("invalid permutation of matrix"); #endif int n=p.size(); int m=q.size(); if(n!=nn ||m!=mm) laerror("incompatible permutation and matrix"); #ifdef CUDALA if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory"); #endif NRMat r(nn,mm); if(inverse) for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=1; j<=m; ++j) r(i-1,j-1) = (*this)(pi,q[j]-1);} else for(int i=1; i<=n; ++i) {int pi=p[i]-1; for(int j=1; j<=m; ++j) r(pi,q[j]-1) = (*this)(i-1,j-1);} return r; } template void NRMat::permuteme_rows(const CyclePerm &p) { #ifdef DEBUG if(!p.is_valid()) laerror("invalid permutation of matrix"); #endif if(p.max()>nn) laerror("incompatible permutation and matrix"); #ifdef CUDALA if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory"); #endif copyonwrite(); T *tmp = new T[mm]; for(int cycle=1; cycle<=p.size(); ++cycle) { int length= p[cycle].size(); if(length<=1) continue; //trivial cycle for(int j=0; j1; --i) for(int j=0; j void NRMat::permuteme_cols(const CyclePerm &p) { #ifdef DEBUG if(!p.is_valid()) laerror("invalid permutation of matrix"); #endif if(p.max()>mm) laerror("incompatible permutation and matrix"); #ifdef CUDALA if(this->getlocation() != cpu || p.getlocation() != cpu ) laerror("permutations can be done only in CPU memory"); #endif copyonwrite(); T *tmp = new T[nn]; for(int cycle=1; cycle<=p.size(); ++cycle) { int length= p[cycle].size(); if(length<=1) continue; //trivial cycle for(int j=0; j1; --i) for(int j=0; j void NRMat::scale_row(const int i, const double f) { #ifdef DEBUG if(i<0||i>=nn) laerror("index out of range in scale_row"); #endif copyonwrite(); #ifdef CUDALA if(location == cpu) { #endif cblas_dscal(mm, f, &(*this)(i,0), 1); #ifdef CUDALA }else{ cublasDscal(mm, f, v+i*mm, 1); TEST_CUBLAS("cublasDscal"); } #endif } template<> void NRMat::scale_col(const int i, const double f) { #ifdef DEBUG if(i<0||i>=mm) laerror("index out of range in scale_col"); #endif copyonwrite(); #ifdef CUDALA if(location == cpu) { #endif cblas_dscal(nn, f, &(*this)(0,i), mm); #ifdef CUDALA }else{ cublasDscal(nn, f, v+i, mm); TEST_CUBLAS("cublasDscal"); } #endif } template<> void NRMat >::scale_row(const int i, const std::complex f) { #ifdef DEBUG if(i<0||i>=nn) laerror("index out of range in scale_row"); #endif copyonwrite(); #ifdef CUDALA if(location == cpu) { #endif cblas_zscal(mm, &f, &(*this)(i,0), 1); #ifdef CUDALA }else{ const cuDoubleComplex fac = *(reinterpret_cast (&f)); cublasZscal(mm, &fac, v+i*mm, 1); TEST_CUBLAS("cublasDscal"); } #endif } template<> void NRMat >::scale_col(const int i, const std::complex f) { #ifdef DEBUG if(i<0||i>=mm) laerror("index out of range in scale_col"); #endif copyonwrite(); #ifdef CUDALA if(location == cpu) { #endif cblas_zscal(nn, &f, &(*this)(0,i), mm); #ifdef CUDALA }else{ const cuDoubleComplex fac = *(reinterpret_cast (&f)); cublasZscal(nn, &fac, v+i, mm); TEST_CUBLAS("cublasDscal"); } #endif } template<> NRMat NRMat::inverse() { NRMat tmp(*this); return calcinverse(tmp); } template<> NRMat > NRMat >::inverse() { NRMat > tmp(*this); return calcinverse(tmp); } //general version template void NRMat::scale_row(const int i, const T f) { #ifdef DEBUG if(i<0||i>=nn) laerror("index out of range in scale_row"); #endif copyonwrite(); for(int j=0; j void NRMat::scale_col(const int i, const T f) { #ifdef DEBUG if(i<0||i>=mm) laerror("index out of range in scale_col"); #endif copyonwrite(); for(int j=0; j; template class NRMat >; //template class NRMat; //template class NRMat >; template class NRMat; template class NRMat; template class NRMat; template class NRMat; template class NRMat; template class NRMat; template class NRMat; template class NRMat; template class NRMat; template class NRMat; }//namespace