From ea2b494abb4271d8d644d424bf91965d09009f37 Mon Sep 17 00:00:00 2001
From: Jiri Pittner <jiri@pittnerovi.com>
Date: Fri, 17 May 2024 16:17:43 +0200
Subject: [PATCH] NRVec: constructor with GPU location option, otimes2vec

---
 vec.cc | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 vec.h  | 33 ++++++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 2 deletions(-)
diff --git a/vec.cc b/vec.cc
index a048009..8a457d6 100644
--- a/vec.cc
+++ b/vec.cc
@@ -704,6 +704,24 @@ const NRMat<double> NRVec<double>::otimes(const NRVec<double> &b,const bool conj
 	return result;
 }
 
+template<>
+const NRVec<double> NRVec<double>::otimes2vec(const NRVec<double> &b,const bool conj, const double &scale) const {
+	// Outer product returned as a flat vector of nn*b.nn elements: result[i*b.nn + j] = scale * v[i] * b.v[j]; conj is not used for real data.
+	SAME_LOC(*this, b);
+	NRVec<double> result(0.0, nn*b.nn, this->getlocation());	// zero-filled; location taken from getlocation() of *this
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		cblas_dger(CblasRowMajor, nn, b.nn, scale, v, 1, b.v, 1, result.v, b.nn);	// rank-1 update of the zeroed result, rows of length b.nn
+#ifdef CUDALA
+	}else{
+		cublasDger(b.nn, nn, scale, b.v, 1, v, 1, result.v, b.nn);	// column-major cuBLAS: b and v swapped so the memory layout matches the CPU branch
+		TEST_CUBLAS("cublasDger");
+	}
+#endif
+	return result;
+}
+
 /***************************************************************************//**
  * computes the outer product of this complex vector \f$\vec{a}\f$ with given
  * complex vector \f$\vec{b}\f$ and scales the resulting matrix with factor \f$\alpha\f$, i.e.
@@ -750,6 +768,42 @@ NRVec<std::complex<double> >::otimes(const NRVec<std::complex<double> > &b, cons
 	return result;
 }
 
+template<>
+const NRVec<std::complex<double> > 
+NRVec<std::complex<double> >::otimes2vec(const NRVec<std::complex<double> > &b, const bool conj, const std::complex<double> &scale) const {
+	// Outer product stored flat in nn*b.nn elements, element (i,j) at i*b.nn + j; with conj set, b enters complex-conjugated.
+	SAME_LOC(*this, b);
+	NRVec<std::complex<double> > result(0., nn*b.nn, this->getlocation());
+	// result starts zeroed, so the accumulating rank-1 updates below leave just the scaled product
+#ifdef CUDALA
+	if(location == cpu){
+#endif
+		if(conj){
+			cblas_zgerc(CblasRowMajor, nn, b.nn, &scale, v, 1, b.v, 1, result.v, b.nn);
+		}else{
+			cblas_zgeru(CblasRowMajor, nn, b.nn, &scale, v, 1, b.v, 1, result.v, b.nn);
+		}
+#ifdef CUDALA
+	}else{
+		if(conj){
+			const cuDoubleComplex alpha = make_cuDoubleComplex(scale.real(), -scale.imag());
+			// column-major cuBLAS with b and v swapped yields conj(scale) * b_i * conj(v_j), the conjugate of the wanted element
+			cublasZgerc(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result.v), b.nn);
+			TEST_CUBLAS("cublasZgerc");
+			// the call above produced conjugated values; conjugateme() presumably flips them back in place (confirm against its definition)
+			result.conjugateme();
+		}else{
+			const cuDoubleComplex alpha = make_cuDoubleComplex(scale.real(), +scale.imag());
+			// no conjugation involved: swapping b and v in column-major order reproduces the row-major layout of the CPU branch
+			cublasZgeru(b.nn, nn, alpha, (cuDoubleComplex*)(b.v), 1, (cuDoubleComplex*)(v), 1, (cuDoubleComplex*)(result.v), b.nn);
+			TEST_CUBLAS("cublasZgeru");
+		}
+	}
+#endif
+	return result;
+}
+
+
 template<>
 NRVec<std::complex<double> > complexify(const NRVec<double> &rhs) {
 	NRVec<std::complex<double> > r(rhs.size(), rhs.getlocation());
@@ -988,7 +1042,8 @@ template<> void NRVec<T>::gemv(const T beta, const SparseMat<T> &a, const char t
 template<> void NRVec<T>::gemv(const LA_traits_complex<T>::Component_type beta, const  LA_traits_complex<T>::NRMat_Noncomplex_type  &a, const char trans,  const  LA_traits_complex<T>::Component_type alpha, const NRVec<T> &x) { laerror("gemv on unsupported types"); } \
 template<> void NRVec<T>::gemv(const  LA_traits_complex<T>::Component_type beta, const  LA_traits_complex<T>::NRSMat_Noncomplex_type  &a, const char trans,  const  LA_traits_complex<T>::Component_type alpha, const NRVec<T> &x) { laerror("gemv on unsupported types"); } \
 template<> NRVec<T> & NRVec<T>::normalize(LA_traits<T>::normtype *) {laerror("normalize() impossible for integer types"); return *this;} \
-template<> const NRMat<T> NRVec<T>::otimes(const NRVec<T> &b,const bool conj, const T &scale) const {laerror("otimes presently implemented only for double and complex double"); return NRMat<T> ();}
+template<> const NRMat<T> NRVec<T>::otimes(const NRVec<T> &b,const bool conj, const T &scale) const {laerror("otimes presently implemented only for double and complex double"); return NRMat<T> ();}\
+template<> const NRVec<T> NRVec<T>::otimes2vec(const NRVec<T> &b,const bool conj, const T &scale) const {laerror("otimes2vec presently implemented only for double and complex double"); return NRVec<T> ();} /* last line of the macro: deliberately no continuation backslash */
 
 
 
diff --git a/vec.h b/vec.h
index c043007..d68fb38 100644
--- a/vec.h
+++ b/vec.h
@@ -130,7 +130,8 @@ public:
 	};
 	
 	//! inlined constructor creating vector of given size filled with prescribed value 
-	inline NRVec(const T &a, const int n);
+	//! the optional loc selects where the data is stored; in CUDALA builds undefined selects DEFAULT_LOC
+	inline NRVec(const T &a, const int n, const GPUID loc = undefined);
 
 	//! inlined constructor creating vector froman array
 	template<int SIZE> inline NRVec(const T (&a)[SIZE]);
@@ -356,9 +357,13 @@ public:
 	
 	//! compute the outer product of two vectors 
 	const NRMat<T> otimes(const NRVec<T> &rhs, const bool conjugate = false, const T &scale = 1) const;
+
 	//! opeartor for outer product computation
 	inline const NRMat<T> operator|(const NRVec<T> &rhs) const { return otimes(rhs,true); };
 
+        //! compute the outer product of two vectors, returned as a flat vector of nn*rhs.nn elements with the index of rhs running fastest
+        const NRVec otimes2vec(const NRVec<T> &rhs, const bool conjugate = false, const T &scale = 1) const;
+
 	//! compute the sum of the vector elements 
 	inline const T sum() const {
 		T sum(v[0]);
@@ -678,6 +683,8 @@ std::istream & operator>>(std::istream &s, NRVec<T> &x) {
  * @param[in] a value to be assigned to all vector elements
  * @param[in] n required vector size
  ******************************************************************************/
+
+/* replaced by the one with optional GPUID
 template <typename T>
 inline NRVec<T>::NRVec(const T& a, const int n): nn(n), count(new int) {
 	*count = 1;
@@ -699,6 +706,30 @@ inline NRVec<T>::NRVec(const T& a, const int n): nn(n), count(new int) {
 	}
 #endif
 }
+*/
+
+//! fill constructor: creates n elements, each equal to a; loc selects the storage location and is
+//! consulted only when CUDALA is defined, where undefined stands for DEFAULT_LOC
+template <typename T>
+inline NRVec<T>::NRVec(const T& a, const int n,  const GPUID loc): nn(n), count(new int) {
+	*count = 1;
+#ifdef CUDALA
+	location = (loc==undefined?DEFAULT_LOC:loc);
+	if(location == cpu){
+#endif
+		v = new T[n];
+		if(!LA_traits<T>::is_plaindata() || a != (T)0){
+			for(int i=0; i<n; i++) v[i] = a;	// no 'register': the specifier was removed in C++17
+		}else{ memset(v, 0, nn*sizeof(T)); }	// plain data with a zero fill value: clear in one call
+#ifdef CUDALA
+	}else{
+		if(sizeof(T)%sizeof(float) != 0)laerror("memory alignment error");	// both checks run before any device memory is allocated
+		if(!LA_traits<T>::is_plaindata()) laerror("only implemented for plain data");
+		v = (T*) gpualloc(n*sizeof(T));
+		smart_gpu_set(n, a, v);
+	}
+#endif
+}
 
 /***************************************************************************//**
  * inline constructor creating vector from an array