*** empty log message ***

2010-09-08 13:30:20 +00:00 · 2010-09-08 13:30:20 +00:00 · 1b85da3291
commit 1b85da3291
parent 074c943862
10 changed files with 49 additions and 19 deletions
--- a/4
+++ b/4
@ -1,3 +1,7 @@
+25.06.2010 Added basic CUBLAS support for NRVec, NRMat, NRSMat
+24.06.2010 Fixed a memory leak that occurred when MATPTR was defined
+18.06.2010 added autoconf support for BLAS+LAPACK compiled with 64-bit integers and for CUBLAS
+11.06.2010 interface to fortran BLAS+LAPACK compiled with 64-bit integers contributed by L. Veis
 25.02.2010 linear_solve_x and multiply_by_inverse contributed by M. Sulc
 17.01.2010 miscellaneous extensions contributed by M. Sulc
 17.01.2010 bugfix in NRMat::operator *= +=  -= for non-square matrix
--- a/Makefile.am
+++ b/Makefile.am
@ -6,9 +6,11 @@ t_SOURCES = t.cc t2.cc
 test_SOURCES = test.cc
 LDADD = .libs/libla.a
 ACLOCAL_AMFLAGS = -I m4
-
 EXTRA_DIST = LICENSE 

+.cu.o:
+	$(NVCC) -o $@ -c $< $(NVCCFLAGS)
+
 #todo: achieve portability of the fortran calls via autoconf?

 #use ./configure CXXFLAGS="" LDFLAGS="" to avoid defaults
--- a/configure.ac
+++ b/configure.ac
@ -33,6 +33,15 @@ AC_CHECK_LIB([lapack], [dgeev_],, [
        ])


+MATPTROPT=""
+AC_ARG_ENABLE([matptr],[  --enable-matptr   switch to double** matrix representation (CUDA incompatible)  [[default=no]]],
+    [case "${enableval}" in
+     yes) MATPTROPT="-DMATPTR" ;;
+     no)  ;;
+     *) AC_MSG_ERROR([bad value ${enableval} for --enable-matptr]) ;;
+     esac],
+ ,)
+
 #check for optional libraries

 #cblas and clapack available?
@ -46,10 +55,13 @@ AC_CHECK_HEADER([clapack.h],,[CLAPACKOPT=-DNONCLAPACK], AC_INCLUDES_DEFAULT)
 AC_SUBST([CLAPACKOPT])

 #CUDA available? link with cublas and avoid cblas and clapack then...
-AC_CHECK_LIB([cublas], [cublasInit], [CUDALIBS=-lcublas CUDAOPT=-DCUDALA CBLASOPT=-DNONCBLAS CLAPACKOPT=-DNONCLAPACK CBLASLIB=""], [CUDALIB="" CUDAOPT=""])
+AC_CHECK_LIB([cublas], [cublasInit], [MATPTROPT="" NVCC=nvcc NVCCFLAGS="-O -arch sm_20" CUDALIBS=-lcublas CUDAOPT=-DCUDALA CBLASOPT=-DNONCBLAS CLAPACKOPT=-DNONCLAPACK CBLASLIB=""], [CUDALIB="" CUDAOPT=""])
 AC_CHECK_HEADER([cublas.h],,[CUDAOPT="" CUDALIBS=""], AC_INCLUDES_DEFAULT)
 AC_SUBST([CUDALIBS])
 AC_SUBST([CUDAOPT])
+AC_SUBST([NVCC])
+AC_SUBST([NVCCFLAGS])
+AC_SUBST([MATPTROPT])


 #the check for traceback needs bfd to be linked into
@ -94,16 +106,6 @@ AC_ARG_ENABLE([debug],[  --disable-debug   not to perform some range-checking [[
 ,)
 AC_SUBST([DEBUGOPT])

-MATPTROPT=""
-AC_ARG_ENABLE([matptr],[  --enable-matptr   switch to double** matrix representation  [[default=no]]],
-    [case "${enableval}" in
-     yes) MATPTROPT="-DMATPTR" ;;
-     no)  ;;
-     *) AC_MSG_ERROR([bad value ${enableval} for --enable-matptr]) ;;
-     esac],
- ,)
-AC_SUBST([MATPTROPT])
-



@ -132,4 +134,5 @@ echo "In addition, similarly you might set include and link paths for Nvidia CUB
 echo "For usage examples see file t.cc. Do not forget using copyonwrite() before        "
 echo "changing individual matrix/vector elements via l.h.s. operator[] or operator()    "
 echo "**********************************************************************************"
+#echo "Use  ./configure --disable-optimize CXXFLAGS="" LDFLAGS="" for a fast compile     "

--- a/cuda_la.h
+++ b/cuda_la.h
@ -11,10 +11,12 @@
 namespace LA {

 #ifdef CUDALA
+#define CPU_GPU(x,y) {if((x)!=cpu && (y)!=cpu) laerror("one operand must be in CPU memory");}
 #define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");}
 #define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
 #define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
 #else
+#define CPU_GPU(x,y) {}
 #define NOT_GPU(x) {}
 #define SAME_LOC(x,y) {}
 #define SAME_LOC3(x,y,z) {}
--- a/fortran.h
+++ b/fortran.h
@ -4,4 +4,10 @@
 #define FORNAME(x) x
 #endif

-#define FORTRAN_INT long
+#ifdef LONG_FORTRAN_INT
+#define FORINT
+#define FINT long
+#else
+#undef FORINT
+#define FINT int
+#endif
--- a/mat.cc
+++ b/mat.cc
@ -355,24 +355,29 @@ NRMat<T> & NRMat<T>::operator-=(const T &a)
 	return *this;
 }

+
 template <>
 const NRMat<double> NRMat<double>::operator-() const
 {
-        NRMat<double> result(nn, mm);
 #ifdef CUDALA
+        NRMat<double> result(nn, mm, location);
        if(location==cpu)
 	{
+#else
+	 NRMat<double> result(nn, mm);
 #endif
 #ifdef MATPTR
        for (int i=0; i<nn*mm; i++) result.v[0][i]= -v[0][i];
 #else
-	cblas_dscal(nn*mm, -1., v, 1);
+	memcpy(result.v,v,nn*mm*sizeof(double));
+	cblas_dscal(nn*mm, -1., result.v, 1);
 #endif
 #ifdef CUDALA
        }
 	else
 	{
-	cublasDscal(nn*mm, -1., v, 1);
+	cublasDcopy(nn*mm, v, 1, result.v, 1);
+	cublasDscal(nn*mm, -1., result.v, 1);
 	}
 #endif
        return result;
--- a/mat.h
+++ b/mat.h
@ -816,6 +816,7 @@ template<typename T>
 void NRMat<T>::moveto(const GPUID dest)
 {
 if(location==dest) return;
+CPU_GPU(location,dest);
 location=dest;

 if(v && !count) laerror("internal inconsistency of reference counting 1");
--- a/smat.h
+++ b/smat.h
@ -656,6 +656,7 @@ template<typename T>
 void NRSMat<T>::moveto(const GPUID dest)
 {
 if(location==dest) return;
+CPU_GPU(location,dest);
 location=dest;

 if(v && !count) laerror("internal inconsistency of reference counting 1");
--- a/sparsemat.cc
+++ b/sparsemat.cc
@ -23,6 +23,7 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <errno.h>
+#include "bitvector.h"
 #include "sparsemat.h"

 namespace LA {
@ -918,14 +919,17 @@ const_cast<SparseMat<T> *>(this)->simplify();

 matel<T> *l=list;
 typename LA_traits<T>::normtype sum(0);
+
 if(scalar!=(T)0)
 	{
+	if(nn!=mm) laerror("subtraction of scalar from non-square sparse matrix in norm()");
+	bitvector has_diagonal_element(nn); has_diagonal_element.clear();
 	if(symmetric) 
 	while(l)
 	        {
 		T hlp=l->elem;
-		bool b=l->row==l->col;
-		if(b) hlp-=scalar;
+		bool b= l->row==l->col;
+		if(b) {hlp-=scalar; has_diagonal_element.set(l->row);}
 		typename LA_traits<T>::normtype tmp=LA_traits<T>::sqrabs(hlp);
       	 sum+= tmp;
 		if(!b) sum+=tmp;
@ -935,10 +939,11 @@ if(scalar!=(T)0)
 	while(l)
 	       	 {
 		T hlp=l->elem;
-		if(l->row==l->col) hlp-=scalar;
+		if(l->row==l->col) {hlp-=scalar; has_diagonal_element.set(l->row);}
 	        sum+= LA_traits<T>::sqrabs(hlp);
 	        l=l->next;
 	        }
+	sum += (nn-has_diagonal_element.population()) * LA_traits<T>::sqrabs(scalar); //add contribution of the subtracted scalar from zero non-stored diagonal elements
 	}
 else
 	{
--- a/vec.h
+++ b/vec.h
@ -711,6 +711,7 @@ template<typename T>
 void NRVec<T>::moveto(const GPUID dest)
 {
 if(location==dest) return;
+CPU_GPU(location,dest);
 location=dest;

 if(v && !count) laerror("internal inconsistency of reference counting 1");