t-hishinuma · t-hishinuma · Sep 6, 2020 · Sep 6, 2020 · Sep 6, 2020 · Sep 6, 2020
diff --git a/.github/workflows/c-cpp.yml b/.github/workflows/c-cpp.yml
@@ -32,6 +32,9 @@ jobs:
     - name: test matrix
       run: cd test; make test_matrix
 
+    - name: test cg
+      run: cd sample/; make; make test
+
   avx2:
     runs-on: ubuntu-latest
 
@@ -56,3 +59,6 @@ jobs:
 
     - name: test matrix
       run: cd test; make test_matrix
+
+    - name: test cg
+      run: cd sample/; make; make test
diff --git a/README.md b/README.md
@@ -3,9 +3,9 @@
 # DD-AVX Library: Library of High Precision Sparse Matrix Operations Accelerated by SIMD
 
 ## About
-DD-AVX_v3 is SIMD accelerated simple interface high precision BLAS / Sparse BLAS Library.
+DD-AVX_v3 is SIMD accelerated simple interface high precision BLAS Lv.1 and Sparse BLAS Library.
 
-BLAS / Sparse BLAS operations can be performed by combining double and double-double precision.
+BLAS Lv.1 and Sparse BLAS operations can be performed by combining double and double-double precision.
 
 This library provides an easy way to implement a fast and accurate Krylov subspace method.
 
@@ -19,7 +19,7 @@ This library is extensions of
 This library provides BLAS / Sparse BLAS functions for the following six types.
 
 ### Scalar
-* double
+* d_real (alias of double)
 * dd_real (provided by the QD Library)
 ### Vector
 * d_real_vector
@@ -28,12 +28,14 @@ This library provides BLAS / Sparse BLAS functions for the following six types.
 * d_real_SpMat
 * dd_real_SpMat
 
-It has BLAS Lv 1 and Sparse BLAS functions for these types.
+It has BLAS Lv.1 and Sparse BLAS functions for these types.
 
 All combinations of BLAS functions are implemented.
 It works for both D and DD types.
 
-See the [axpy sample code](https://github.com/t-hishinuma/DD-AVX_v3/blob/master/test/vector_blas/axpy.cpp) for more information on how to use it.
+See the [axpy sample code](https://github.com/t-hishinuma/DD-AVX_v3/blob/master/test/vector_blas/axpy.cpp) and 
+[CG method sample code](https://github.com/t-hishinuma/DD-AVX_v3/blob/master/sample/cg.cpp) 
+for more information on how to use it.
 
 # Build and Install
 This library requires the QD library for scalar operations as a submodule.

diff --git a/sample/Makefile b/sample/Makefile
@@ -13,11 +13,11 @@ $(OBJS): cg.cpp
 	$(CXX) $(CXXFLAGS) $(RPATH) $^ -o $@ $(LIB)
 
 test: trimat.mtx
-	./cg.out ./trimat.mtx
+	./cg.out ./trimat.mtx 1.0e-12 0
 
 trimat.mtx: trimat_gen.cpp
-	g++ trimat_gen.cpp -o trimat_gen.out
-	./trimat_gen.out 100 2 > trimat.mtx
+	g++ trimat_gen.cpp -o trimat_gen.out 
+	./trimat_gen.out 10000 2 > trimat.mtx
 
 clean:
 	- rm $(OBJS)

diff --git a/sample/cg.cpp b/sample/cg.cpp
@@ -2,62 +2,137 @@
 #include<vector>
 #include<iostream>
 #include<chrono>
-#define TOL 1.0e-6
+template<typename X, typename B, typename R, typename P, typename Q, typename ALPHA, typename BETA>
+int cg(d_real_SpMat& A, const double tol, int rhisotry){
 
-int main(int argc, char** argv){
-	bool ret=0;
-
-	if(argc!=2){
-		std::cout << "error, $1 = matrix file" << std::endl;
-		return 1;
-	}
-
-	d_real_SpMat A;
-	A.input_mm(argv[1]);
-
- 	dd_real_vector x(A.get_row(), 123.0);
+    X x(A.get_row(), 2.0);
 
- 	dd_real_vector ans(A.get_row(), 1.0);
- 	dd_real_vector b(A.get_row(), 0.0);
+    X ans(A.get_row(), 1.0);
+    B b(A.get_row(), 0.0);
 
     dd_avx::matvec(A, ans, b); // b = A*1
 
- 	dd_real_vector r(A.get_row(), 0.0);
- 	dd_real_vector rold(A.get_row(), 0.0);
- 	dd_real_vector p(A.get_row(), 0.0);
- 	dd_real_vector q(A.get_row(), 0.0);
+    R r(A.get_row(), 0.0);
+    P p(A.get_row(), 0.0);
+    Q q(A.get_row(), 0.0);
+
+    ALPHA alpha = 0;
+    BETA beta = 0;
 
     dd_avx::matvec(A, x, q);
-	r = b - q;
+    r = b - q;
 
-	//p0 = Mr0
-	p = r;
+    //p0 = Mr0
+    p = r;
 
-    //for(size_t iter = 0; iter < A.get_row(); iter++)
     for(size_t iter = 0; iter < A.get_row(); iter++)
     {
         dd_avx::matvec(A,p,q);
 
         auto tmp = dd_avx::dot(r,r);
-        auto alpha = tmp / dd_avx::dot(p,q);
+        alpha = tmp / dd_avx::dot(p,q);
 
         dd_avx::axpy(alpha, p, x);
 
         dd_avx::axpy(-alpha, q, r);
 
-        auto beta = dd_avx::dot(r,r) / tmp;
+        beta = dd_avx::dot(r,r) / tmp;
 
         dd_avx::xpay(beta, r, p); //p = r + beta*p
 
         double resid = dd_avx::nrm2(r);
-        printf("%e\n", resid);
+        if(rhisotry==1){
+            printf("%ld\t%e\n", iter, resid);
+        }
 
         if( resid < 1.0e-12){
-            printf("iter: %ld\n", iter);
+            printf("iter:\t %ld\n", iter);
             return 0;
         } 
     }
+    printf("iter:\tmaxiter\n");
 
-	return 1;
+    return 1;
 }
 
+int main(int argc, char** argv){
+    bool ret=0;
+
+    if(argc!=4){
+        std::cout << "error, $1 = matrix file, $2=tol, $3=print rhistory(1/0)" << std::endl;
+        return 1;
+    }
+
+    d_real_SpMat A;
+    A.input_mm(argv[1]);
+    double tol = atof(argv[2]);
+    int rhistory = atoi(argv[3]);
+
+    std::cout << "Matrix dimension\t" << A.get_row() << std::endl;
+    std::cout << "Matrix nnz\t" << A.get_nnz() << std::endl;
+    std::cout << "tol\t" << tol << std::endl;
+
+    // all DD
+    std::cout << "===" << std::endl;
+    std::cout << "all DD" << std::endl;
+	auto start = std::chrono::system_clock::now();
+    ret = cg<
+        dd_real_vector, //x
+        dd_real_vector, //b
+        dd_real_vector, //r
+        dd_real_vector, //p
+        dd_real_vector, //q
+        dd_real, //alpha
+        dd_real //beta
+            >(A, tol, rhistory);
+	auto end = std::chrono::system_clock::now();
+	double sec = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count()/1.0e+9/100;
+    if(ret==1){
+        std::cout << "error, maxiter" << std::endl;
+        return 1;
+    }
+	std::cout << "...pass\t" << sec << std::endl;
+
+
+    // all Double
+    std::cout << "===" << std::endl;
+    std::cout << "all Double" << std::endl;
+	start = std::chrono::system_clock::now();
+    ret = cg<
+        d_real_vector, //x
+        d_real_vector, //b
+        d_real_vector, //r
+        d_real_vector, //p
+        d_real_vector, //q
+        d_real, //alpha
+        d_real //beta
+            >(A, tol, rhistory);
+	end = std::chrono::system_clock::now();
+	sec = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count()/1.0e+9/100;
+    if(ret==1){
+        std::cout << "error, maxiter" << std::endl;
+        return 1;
+    }
+	std::cout << "...pass\t" << sec << std::endl;
+
+    // mix
+    std::cout << "===" << std::endl;
+    std::cout << "mix precision" << std::endl;
+	start = std::chrono::system_clock::now();
+    ret = cg<
+        d_real_vector, //x
+        d_real_vector, //b
+        dd_real_vector, //r
+        dd_real_vector, //p
+        d_real_vector, //q
+        dd_real, //alpha
+        dd_real //beta
+            >(A, tol, rhistory);
+	end = std::chrono::system_clock::now();
+	sec = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count()/1.0e+9/100;
+    if(ret==1){
+        std::cout << "error, maxiter" << std::endl;
+        return 1;
+    }
+	std::cout << "...pass\t" << sec << std::endl;
+}
diff --git a/sample/trimat_gen.cpp b/sample/trimat_gen.cpp
@@ -14,15 +14,15 @@ int main(int argc, char** argv){
     std::cout << "%%MatrixMarket matrix coordinate real general" << std::endl;
     std::cout << N << " " << N << " " << N*3-2 << std::endl;
 
-    std::cout << 1 << " " << 1 << " " << 2 << std::endl;
+    std::cout << 1 << " " << 1 << " " << 3 << std::endl;
     std::cout << 1 << " " << 2 << " " << val << std::endl;
 
     for(size_t i = 2; i <= N-1; i++){
     std::cout << i << " " << i-1 << " " << val << std::endl;
-    std::cout << i << " " << i << " " << 2 << std::endl;
+    std::cout << i << " " << i << " " << 3 << std::endl;
     std::cout << i << " " << i+1 << " " << val << std::endl;
     }
 
     std::cout << N << " " << N-1 << " " << val << std::endl;
-    std::cout << N << " " << N << " " << 2 << std::endl;
+    std::cout << N << " " << N << " " << 3 << std::endl;
 }
diff --git a/src/core/AVX2_core.hpp b/src/core/AVX2_core.hpp
@@ -4,18 +4,18 @@
 
 #include <immintrin.h>
 #define SIMD_FUNC(NAME) _mm256_##NAME
-using SIMDreg = __m256d;
+using reg = __m256d;
 struct dd_real;
 
 namespace ddavx_core{
 	const int SIMD_Length=4;
 	class registers{
 		public:
 			double splitter = 134217729.0;
-			SIMDreg sp = SIMD_FUNC(broadcast_sd)(&splitter); 
-			SIMDreg minus = SIMD_FUNC(set_pd)(-1.0,-1.0,-1.0,-1.0);
-			SIMDreg zeros = SIMD_FUNC(set_pd)(0,0,0,0);
-			SIMDreg one,bh,bl,ch,cl,sh,sl,wh,wl,th,tl,p1,p2,t0,t1,t2,eh,t3; 
+			reg sp = SIMD_FUNC(broadcast_sd)(&splitter); 
+			reg minus = SIMD_FUNC(set_pd)(-1.0,-1.0,-1.0,-1.0);
+			reg zeros = SIMD_FUNC(set_pd)(0,0,0,0);
+			reg one,bh,bl,ch,cl,sh,sl,wh,wl,th,tl,p1,p2,t0,t1,t2,eh,t3; 
 	};
 
 
@@ -24,50 +24,50 @@ namespace ddavx_core{
 	////////////////////////////////////////////
 
 	//load
-	inline SIMDreg load(const double& val){
+	inline reg load(const double& val){
 		return SIMD_FUNC(loadu_pd)((double*)&(val)); 
 	}
 
 	//set
 	// need to change for AVX512
-	inline SIMDreg set(const double a, const double b, const double c, const double d){
-		SIMDreg ret = SIMD_FUNC(set_pd)(a, b, c, d);
+	inline reg set(const double a, const double b, const double c, const double d){
+		reg ret = SIMD_FUNC(set_pd)(a, b, c, d);
 		return ret;
 	}
 
 	//broadcast
-	inline SIMDreg broadcast(const double a){
-		SIMDreg ret = SIMD_FUNC(broadcast_sd)(&a); 
+	inline reg broadcast(const double a){
+		reg ret = SIMD_FUNC(broadcast_sd)(&a); 
 		return ret;
 	}
 
 	//store
-	inline void store(double& ret, const SIMDreg val){
+	inline void store(double& ret, const reg val){
 		SIMD_FUNC(storeu_pd)((double*)&(ret), val);
 	}
 
 	// add
-	inline SIMDreg add(const SIMDreg a, const SIMDreg b){
+	inline reg add(const reg a, const reg b){
 		return SIMD_FUNC(add_pd)(a, b);
 	}
 
 	// mul
-	inline SIMDreg mul(const SIMDreg a, const SIMDreg b){
+	inline reg mul(const reg a, const reg b){
 		return SIMD_FUNC(mul_pd)(a, b);
 	}
 
 	// sub
-	inline SIMDreg sub(const SIMDreg a, const SIMDreg b){
+	inline reg sub(const reg a, const reg b){
 		return SIMD_FUNC(sub_pd)(a, b);
 	}
 
 	// div
-	inline SIMDreg div(const SIMDreg a, const SIMDreg b){
+	inline reg div(const reg a, const reg b){
 		return SIMD_FUNC(div_pd)(a, b);
 	}
 
 	// fmadd
-	inline SIMDreg fmadd(const SIMDreg a, const SIMDreg b, const SIMDreg c){
+	inline reg fmadd(const reg a, const reg b, const reg c){
 		return SIMD_FUNC(fmadd_pd)(a, b, c);
 	}
 
@@ -77,7 +77,7 @@ namespace ddavx_core{
 	////////////////////////////////////////////
 
 	// need to change for AVX512
-	inline dd_real reduction(SIMDreg a_hi, SIMDreg a_lo){
+	inline dd_real reduction(reg a_hi, reg a_lo){
 		double hi[4];
 		double lo[4];
 		store(*hi, a_hi); 
@@ -92,7 +92,7 @@ namespace ddavx_core{
 	}
 
 	// need to change for AVX512
-	inline void store(double& ret1, double& ret2, double& ret3, double& ret4, const SIMDreg val){
+	inline void store(double& ret1, double& ret2, double& ret3, double& ret4, const reg val){
 		double tmp[4];
 		store(*tmp, val); 
 		ret1 = tmp[0];
@@ -101,7 +101,7 @@ namespace ddavx_core{
 		ret4 = tmp[3];
 	}
 
-	inline void print(SIMDreg a){
+	inline void print(reg a){
 		double tmp[SIMD_Length];
 		store(*tmp, a); 
 		for(int i=0; i<SIMD_Length; i++){
@@ -129,12 +129,12 @@ namespace ddavx_core{
 		ie = ie + is;
 	}
 	inline void to_minus(double& val, registers& regs){
-		SIMDreg reg = load(val);
+		reg reg = load(val);
  		reg = mul(reg, regs.minus);
  		store(val, reg);
 	}
 
-	inline void to_minus(SIMDreg& reg, registers& regs){
+	inline void to_minus(reg& reg, registers& regs){
  		reg = mul(reg, regs.minus);
 	}
 }