*** empty log message ***

jiri 2010-09-08 13:30:20 +00:00
parent 074c943862
commit 1b85da3291
10 changed files with 49 additions and 19 deletions

@@ -1,3 +1,7 @@
+25.06.2010 Added basic CUBLAS support for NRVec, NRMat, NRSMat
+24.06.2010 Fixed a memory leak existing when MATPTR was defined
+18.06.2010 added autoconf support for BLAS+LAPACK compiled with 64-bit integers and for CUBLAS
+11.06.2010 interface to fortran BLAS+LAPACK compiled with 64-bit integers contributed by L. Veis
 25.02.2010 linear_solve_x and multiply_by_inverse contributed by M. Sulc
 17.01.2010 miscellaneous extensions contributed by M. Sulc
 17.01.2010 bugfix in NRMat::operator *= += -= for non-square matrix

@@ -6,9 +6,11 @@ t_SOURCES = t.cc t2.cc
 test_SOURCES = test.cc
 LDADD = .libs/libla.a
 ACLOCAL_AMFLAGS = -I m4
 EXTRA_DIST = LICENSE
+.cu.o:
+	$(NVCC) -o $@ -c $< $(NVCCFLAGS)
 #todo: achieve portability of the fortran calls via autoconf?
 #use ./configure CXXFLAGS="" LDFLAGS="" to avoid defaults

@@ -33,6 +33,15 @@ AC_CHECK_LIB([lapack], [dgeev_],, [
 ])
+MATPTROPT=""
+AC_ARG_ENABLE([matptr],[ --enable-matptr switch to double** matrix representation (CUDA incompatible) [[default=no]]],
+[case "${enableval}" in
+  yes) MATPTROPT="-DMATPTR" ;;
+  no) ;;
+  *) AC_MSG_ERROR([bad value ${enableval} for --enable-matptr]) ;;
+esac],
+,)
 #check for optional libraries
 #cblas and clapack available?
@@ -46,10 +55,13 @@ AC_CHECK_HEADER([clapack.h],,[CLAPACKOPT=-DNONCLAPACK], AC_INCLUDES_DEFAULT)
 AC_SUBST([CLAPACKOPT])
 #CUDA available? link with cublas and avoid cblas and clapack then...
-AC_CHECK_LIB([cublas], [cublasInit], [CUDALIBS=-lcublas CUDAOPT=-DCUDALA CBLASOPT=-DNONCBLAS CLAPACKOPT=-DNONCLAPACK CBLASLIB=""], [CUDALIB="" CUDAOPT=""])
+AC_CHECK_LIB([cublas], [cublasInit], [MATPTROPT="" NVCC=nvcc NVCCFLAGS="-O -arch sm_20" CUDALIBS=-lcublas CUDAOPT=-DCUDALA CBLASOPT=-DNONCBLAS CLAPACKOPT=-DNONCLAPACK CBLASLIB=""], [CUDALIB="" CUDAOPT=""])
 AC_CHECK_HEADER([cublas.h],,[CUDAOPT="" CUDALIBS=""], AC_INCLUDES_DEFAULT)
 AC_SUBST([CUDALIBS])
 AC_SUBST([CUDAOPT])
+AC_SUBST([NVCC])
+AC_SUBST([NVCCFLAGS])
+AC_SUBST([MATPTROPT])
 #the check for traceback needs bfd to be linked into
@@ -94,16 +106,6 @@ AC_ARG_ENABLE([debug],[ --disable-debug not to perform some range-checking [[
 ,)
 AC_SUBST([DEBUGOPT])
-MATPTROPT=""
-AC_ARG_ENABLE([matptr],[ --enable-matptr switch to double** matrix representation [[default=no]]],
-[case "${enableval}" in
-  yes) MATPTROPT="-DMATPTR" ;;
-  no) ;;
-  *) AC_MSG_ERROR([bad value ${enableval} for --enable-matptr]) ;;
-esac],
-,)
-AC_SUBST([MATPTROPT])
@@ -132,4 +134,5 @@ echo "In addition, similarly you might set include and link paths for Nvidia CUB
 echo "For usage examples see file t.cc. Do not forget using copyonwrite() before "
 echo "changing individual matrix/vector elements via l.h.s. operator[] or operator() "
 echo "**********************************************************************************"
+#echo "Use ./configure --disable-optimize CXXFLAGS="" LDFLAGS="" for a fast compile "

@@ -11,10 +11,12 @@
 namespace LA {
 #ifdef CUDALA
+#define CPU_GPU(x,y) {if((x)!=cpu && (y)!=cpu) laerror("one operand must be in CPU memory");}
 #define NOT_GPU(x) {if((x).getlocation()!=cpu) laerror("Operation not implemented on GPU (yet). Use .moveto(0) first.");}
 #define SAME_LOC(x,y) {if((x).getlocation()!=(y).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
 #define SAME_LOC3(x,y,z) {if((x).getlocation()!=(y).getlocation() || (x).getlocation()!=(z).getlocation()) laerror("Operands have different location. Use .moveto() first.");}
 #else
+#define CPU_GPU(x,y) {}
 #define NOT_GPU(x) {}
 #define SAME_LOC(x,y) {}
 #define SAME_LOC3(x,y,z) {}
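
These guards are meant to sit at the top of an operation so that mixed-location or GPU-resident operands fail early with laerror() instead of dereferencing device memory on the host; note that CPU_GPU compares two location values directly (it is used in the moveto() hunks below), while NOT_GPU and SAME_LOC query objects via getlocation(). A hypothetical usage sketch, not part of this commit:

// illustrative only: how the location guards are intended to be used
template <typename T>
void axpy_like(NRVec<T> &x, const NRVec<T> &y)
{
	SAME_LOC(x, y);   // both operands must live in the same memory space
	NOT_GPU(x);       // this particular routine has no CUBLAS path yet
	// ... CPU-only implementation ...
}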

@@ -4,4 +4,10 @@
 #define FORNAME(x) x
 #endif
-#define FORTRAN_INT long
+#ifdef LONG_FORTRAN_INT
+#define FORINT
+#define FINT long
+#else
+#undef FORINT
+#define FINT int
+#endif
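
With FINT in place, Fortran BLAS/LAPACK prototypes can be written once and re-typed to 64-bit integers by defining LONG_FORTRAN_INT (which autoconf now detects, per the changelog entry above), FORINT presumably serving as the corresponding feature-test macro. A hedged sketch of the intended use, with dscal as a stand-in routine; the prototype below is illustrative, not copied from the library headers:

// illustrative only: the Fortran integer width is switched in one place via FINT
extern "C" void FORNAME(dscal)(const FINT *n, const double *alpha, double *x, const FINT *incx);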

mat.cc

@@ -355,24 +355,29 @@ NRMat<T> & NRMat<T>::operator-=(const T &a)
 return *this;
 }
 template <>
 const NRMat<double> NRMat<double>::operator-() const
 {
-NRMat<double> result(nn, mm);
 #ifdef CUDALA
+NRMat<double> result(nn, mm, location);
 if(location==cpu)
 {
+#else
+NRMat<double> result(nn, mm);
 #endif
 #ifdef MATPTR
 for (int i=0; i<nn*mm; i++) result.v[0][i]= -v[0][i];
 #else
-cblas_dscal(nn*mm, -1., v, 1);
+memcpy(result.v,v,nn*mm*sizeof(double));
+cblas_dscal(nn*mm, -1., result.v, 1);
 #endif
 #ifdef CUDALA
 }
 else
 {
-cublasDscal(nn*mm, -1., v, 1);
+cublasDcopy(nn*mm, v, 1, result.v, 1);
+cublasDscal(nn*mm, -1., result.v, 1);
 }
 #endif
 return result;
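
The rewritten unary minus no longer scales the operand's own storage v in place; it first copies into the result (memcpy on the CPU, cublasDcopy on the GPU) and negates the copy, and the result is now allocated at the same location as the operand. A small usage sketch, illustrative only:

NRMat<double> a(3, 3);
// ... fill a (after copyonwrite(), as the configure summary reminds) ...
NRMat<double> b = -a;   // b is freshly allocated at a's location; a is left unchanged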

mat.h

@@ -816,6 +816,7 @@ template<typename T>
 void NRMat<T>::moveto(const GPUID dest)
 {
 if(location==dest) return;
+CPU_GPU(location,dest);
 location=dest;
 if(v && !count) laerror("internal inconsistency of reference counting 1");

smat.h

@@ -656,6 +656,7 @@ template<typename T>
 void NRSMat<T>::moveto(const GPUID dest)
 {
 if(location==dest) return;
+CPU_GPU(location,dest);
 location=dest;
 if(v && !count) laerror("internal inconsistency of reference counting 1");

@@ -23,6 +23,7 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <errno.h>
+#include "bitvector.h"
 #include "sparsemat.h"
 namespace LA {
@@ -918,14 +919,17 @@ const_cast<SparseMat<T> *>(this)->simplify();
 matel<T> *l=list;
 typename LA_traits<T>::normtype sum(0);
 if(scalar!=(T)0)
 {
+if(nn!=mm) laerror("subtraction of scalar from non-square sparse matrix in norm()");
+bitvector has_diagonal_element(nn); has_diagonal_element.clear();
 if(symmetric)
 while(l)
 {
 T hlp=l->elem;
-bool b=l->row==l->col;
-if(b) hlp-=scalar;
+bool b= l->row==l->col;
+if(b) {hlp-=scalar; has_diagonal_element.set(l->row);}
 typename LA_traits<T>::normtype tmp=LA_traits<T>::sqrabs(hlp);
 sum+= tmp;
 if(!b) sum+=tmp;
@@ -935,10 +939,11 @@ if(scalar!=(T)0)
 while(l)
 {
 T hlp=l->elem;
-if(l->row==l->col) hlp-=scalar;
+if(l->row==l->col) {hlp-=scalar; has_diagonal_element.set(l->row);}
 sum+= LA_traits<T>::sqrabs(hlp);
 l=l->next;
 }
+sum += (nn-has_diagonal_element.population()) * LA_traits<T>::sqrabs(scalar); //add contribution of the subtracted scalar from zero non-stored diagonal elements
 }
 else
 {
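
The point of the bitvector bookkeeping: diagonal positions with no stored element are implicitly zero, so when a scalar s is subtracted from the diagonal before accumulating the squared norm, each such position still contributes |s|^2. Writing n for the dimension and D for the set of rows that do have a stored diagonal element, the quantity accumulated in the non-symmetric branch is, in effect,

\sum_{\text{stored }(i,j)} \lvert a_{ij} - s\,\delta_{ij} \rvert^2 \;+\; (n - \lvert D \rvert)\,\lvert s \rvert^2

which is exactly what the added population() term supplies; the new nn!=mm check guards the operation because subtracting a scalar from the diagonal only makes sense for a square matrix.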

vec.h

@@ -711,6 +711,7 @@ template<typename T>
 void NRVec<T>::moveto(const GPUID dest)
 {
 if(location==dest) return;
+CPU_GPU(location,dest);
 location=dest;
 if(v && !count) laerror("internal inconsistency of reference counting 1");
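
In all three moveto() implementations the new CPU_GPU(location,dest) check means a transfer must have the CPU on at least one end, so a direct device-to-device move now stops with laerror(). A hedged sketch of the resulting calling pattern; the GPU ids are illustrative, only moveto(0) as the CPU destination is documented by the ".moveto(0)" hint in the NOT_GPU message above:

// hypothetical ids: 0 = CPU, gpuA/gpuB = two devices
v.moveto(0);       // gpuA -> CPU is allowed (one end is the CPU)
v.moveto(gpuB);    // CPU -> gpuB is allowed
// moving straight from gpuA to gpuB would trigger laerror("one operand must be in CPU memory")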